Commit a6d507f
Parent(s): 45c0614
added leaderboards
Browse files
- app.py +112 -9
- data/bosc_zh_mc.csv +21 -0
- data/dfcdata_zh_mc.csv +21 -0
- data/gtja_zh_mc.csv +20 -0
- data/huaweicloud_zh_mc.csv +21 -0
- data/lenovo_en_mc.csv +9 -0
- data/lenovo_zh_mc.csv +15 -0
- data/network_en_mc.csv +28 -0
- data/network_en_qa.csv +11 -0
- data/network_zh_mc.csv +31 -0
- data/network_zh_qa.csv +10 -0
- data/oracle_en_mc.csv +28 -0
- data/oracle_zh_mc.csv +28 -0
- data/pufa_zh_mc.csv +21 -0
- data/rzy_zh_mc.csv +21 -0
- data/tencent_zh_qa.csv +14 -0
- data/zabbix_zh_mc.csv +21 -0
- data/zjyd_zh_mc.csv +28 -0
- data/zjyd_zh_qa.csv +11 -0
- data/zte_en_mc.csv +26 -0
- data/zte_zh_mc.csv +28 -0
- df_process.ipynb +743 -0
- leaderboards.py +22 -0
- opseval_datasets.py +63 -0
app.py
CHANGED
@@ -9,23 +9,124 @@ import matplotlib.pyplot as plt
 import plotly.graph_objects as go
 from apscheduler.schedulers.background import BackgroundScheduler
 from texts import INTRODUCTION_TEXT, TITLE
+from leaderboards import eng_leaderboards, chi_leaderboards
+from opseval_datasets import *
 
-
-
-
+
+# df_lang = {
+#     'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
+#     'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
+# }
+
+
+
+def create_lang_tabs(lang, lang_cates):
+    df_dict = {}
+    for dataset, cates in lang_cates:
+        dataset_dt = {}
+        for cat in cates:
+            leaderboard_df = pd.read_csv(f'./data/{dataset}_{lang}_{cat}.csv')
+            dataset_dt[cat] = leaderboard_df
+        df_dict[dataset] = dataset_dt
+    return df_dict
+
+
+dict_lang = {
+    'English': create_lang_tabs('en', eng_leaderboards),
+    'Chinese': create_lang_tabs('zh', chi_leaderboards)
 }
 
-def
+def process_mc_df(df, shot=None):
+    # rename the "name" column to "Model"
+    df = df.rename(columns={"name": "Model"})
+    # reorganize the zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot, few_cot_self_con columns into a MultiIndex: one level for Zeroshot / Fewshot, one for Naive, Self-Consistency, CoT, CoT+Self-Consistency
+    df = df.set_index("Model")
+    # df = df.stack().unstack()
+    df.columns = pd.MultiIndex.from_tuples([("Zeroshot", "Naive"), ("Zeroshot", "SC"), ("Zeroshot", "CoT"), ("Zeroshot", "CoT+SC"), ("Fewshot", "Naive"), ("Fewshot", "SC"), ("Fewshot", "CoT"), ("Fewshot", "CoT+SC")])
+    # keep only the columns for the given shot, e.g. shot=Zeroshot keeps only the Zeroshot columns
+    if shot:
+        df = df[shot]
+    # convert every column except Model to numeric; values that fail to parse become NaN
+    df = df.apply(pd.to_numeric, errors="coerce")
+    # keep two decimal places
+    df = df.round(2)
+    # add a BestScore column to every row
+    df["BestScore"] = df.max(axis=1)
+    # sort df by BestScore
+    df = df.sort_values(by="BestScore", ascending=False)
+    # reset_index
+    df = df.reset_index()
+    return df
+
+def dataframe_to_gradio(df, is_mc=True, shot=None):
+
+    if is_mc:
+        df = process_mc_df(df, shot)
     headers = df.columns
-    types = ["str"] + ["number"] * (len(headers) - 1)
+    # types = ["str"] + ["number"] * (len(headers) - 1)
 
     return gr.components.Dataframe(
         value=df.values.tolist(),
-        headers=[
-        datatype=types,
+        headers=[label for label in df.columns],
+        # datatype=types,
         # max_rows=10,
     )
 
+def plot_radar_chart(df, attributes):
+    fig = go.Figure()
+
+    for index, row in df.iterrows():
+        model = row['Model']
+        values = row[attributes].tolist()
+        fig.add_trace(go.Scatterpolar(
+            r=values,
+            theta=attributes,
+            fill='toself',
+            name=model
+        ))
+
+    fig.update_layout(
+        title="OpsEval",
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 0.9]
+            )),
+        showlegend=True
+    )
+
+    return fig
+
+
+def create_lang_leader_board(lang_dict):
+
+    best_scores = {}
+    best_plot_datasets = []
+    for dataset, value in lang_dict.items():
+        for cat, df in value.items():
+            if cat == 'mc':
+                processed = process_mc_df(df)
+                bestscores = processed['BestScore']
+                best_scores[dataset] = bestscores
+                best_plot_datasets.append(dataset)
+    best_df = pd.DataFrame(best_scores)
+    # print(best_scores)
+    # print(best_df)
+    # plot = plot_radar_chart(pd.DataFrame(best_scores), best_plot_datasets)
+    # gr.Plot(plot)
+
+    for dataset, value in lang_dict.items():
+        with gr.Tab(dataset_abbr_en_dict[dataset]):
+            for cat, df in value.items():
+                if cat == 'mc':
+                    for shot in ['Zeroshot', 'Fewshot']:
+                        with gr.Tab(f'Multiple Choice Question ({shot})'):
+                            dataframe_to_gradio(df, is_mc=True, shot=shot)
+                else:
+                    with gr.Tab('Question Answering'):
+                        dataframe_to_gradio(df, is_mc=False)
+
+
 
 def launch_gradio():
     demo = gr.Blocks()
@@ -33,12 +134,14 @@ def launch_gradio():
     with demo:
         gr.HTML(TITLE)
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-        for key,
+        for key, dict in dict_lang.items():
            with gr.Tab(key):
-                create_lang_leader_board(
+                create_lang_leader_board(dict)
 
     demo.launch()
 
+pd.set_option('display.float_format', '{:.02f}'.format)
+
 scheduler = BackgroundScheduler()
 scheduler.add_job(launch_gradio, 'interval', hours=1)
 scheduler.start()
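app.py now pulls its tab structure from two new modules, leaderboards.py and opseval_datasets.py, whose contents are not shown in this hunk. Below is a minimal sketch of the shapes create_lang_tabs() and create_lang_leader_board() appear to expect; the entries and display names are illustrative assumptions, not the real module contents.

```python
# Hypothetical sketch only: the real leaderboards.py / opseval_datasets.py are not part of this excerpt.

# leaderboards.py -- (dataset, categories) pairs iterated by create_lang_tabs(),
# matching the ./data/{dataset}_{lang}_{cat}.csv files added in this commit.
eng_leaderboards = [
    ("network", ["mc", "qa"]),   # data/network_en_mc.csv, data/network_en_qa.csv
    ("oracle",  ["mc"]),         # data/oracle_en_mc.csv
    ("zte",     ["mc"]),         # data/zte_en_mc.csv
    ("lenovo",  ["mc"]),         # data/lenovo_en_mc.csv
]

chi_leaderboards = [
    ("network", ["mc", "qa"]),   # data/network_zh_mc.csv, data/network_zh_qa.csv
    ("zte",     ["mc"]),         # data/zte_zh_mc.csv
    ("zjyd",    ["mc", "qa"]),   # data/zjyd_zh_mc.csv, data/zjyd_zh_qa.csv
    # ... bosc, dfcdata, gtja, huaweicloud, lenovo, oracle, pufa, rzy, tencent, zabbix
]

# opseval_datasets.py -- display names for the per-dataset Gradio tabs; the keys must match
# the dataset identifiers above, and the values here are placeholder labels.
dataset_abbr_en_dict = {
    "network": "Wired Network Operation",
    "oracle":  "Oracle Database Operation",
    "zte":     "ZTE Network Operation",
    "zjyd":    "Zhejiang Mobile",
    "lenovo":  "Lenovo",
    # ...
}
```

Note that dataframe_to_gradio() only distinguishes 'mc' from everything else, so any category other than 'mc' falls through to the Question Answering tab.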
data/bosc_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,52.50,52.50,62.50,62.50,57.50,57.50,57.50,57.50
+Yi-34B-Chat,50.00,50.00,52.50,55.00,55.00,55.00,60.00,67.50
+DevOps-Model-14B-Chat,50.00,50.00,55.00,62.50,35.00,27.50,37.50,52.50
+LLaMA-2-7B,45.00,45.00,45.00,45.00,32.50,32.50,45.00,45.00
+Qwen-72B-Chat,45.00,45.00,60.00,60.00,50.00,50.00,47.50,47.50
+GPT-3.5-turbo,40.00,40.00,50.00,55.00,50.00,47.50,55.00,55.00
+ERNIE-Bot-4.0,52.50,52.50,57.50,57.50,57.50,57.50,60.00,60.00
+Mistral-7B,20.00,20.00,50.00,50.00,0.00,0.00,37.50,37.50
+LLaMA-2-13B,50.00,50.00,42.50,42.50,42.50,42.50,50.00,50.00
+Baichuan2-13B-Chat,37.50,37.50,42.50,45.00,37.50,40.00,47.50,52.50
+Qwen-14B-Chat,50.00,47.50,55.00,57.50,47.50,45.00,50.00,47.50
+LLaMA-2-70B-Chat,25.00,25.00,45.00,45.00,0.00,0.00,57.50,57.50
+ChatGLM3-6B,47.5,47.5,45,45,35,35,50,50
+InternLM2-Chat-20B,47.5,47.5,,,47.5,47.5,,
+InternLM2-Chat-7B,55,55,62.5,62.5,60,60,57.5,57.5
+gemma_2b,32.5,32.5,40,40 ,37.5,37.5,40,40
+gemma_7b,40,40,50,50 ,32.5,32.5,62.5,62.5
+qwen1.5-14b-base,47.5,47.5,45,45 ,47.5,47.5,50,50
+qwen1.5-14b-chat,52.5,55,60,60,45,47.5,60,72.5
+
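Every *_mc.csv added in this commit shares this layout: a name column plus eight score columns covering the naive, self-consistency, CoT and CoT+SC prompts in the zero-shot and few-shot settings (the column order varies; this file lists the few_* block first, while e.g. network_en_mc.csv lists zero_* first). A minimal sketch, assuming only pandas, of the reshaping process_mc_df applies before the table reaches Gradio:

```python
import pandas as pd

# Load one of the newly added multiple-choice result files.
df = pd.read_csv("./data/bosc_zh_mc.csv").rename(columns={"name": "Model"}).set_index("Model")

# Label the eight score columns with a (shot, strategy) MultiIndex, following the
# order of this file's header (few_* block first, then zero_*).
df.columns = pd.MultiIndex.from_tuples([
    ("Fewshot", "Naive"), ("Fewshot", "SC"), ("Fewshot", "CoT"), ("Fewshot", "CoT+SC"),
    ("Zeroshot", "Naive"), ("Zeroshot", "SC"), ("Zeroshot", "CoT"), ("Zeroshot", "CoT+SC"),
])

# What the "Multiple Choice Question (Fewshot)" tab shows: numeric scores, two decimals,
# plus a BestScore column used for ranking.
fewshot = df["Fewshot"].apply(pd.to_numeric, errors="coerce").round(2)
fewshot["BestScore"] = fewshot.max(axis=1)
print(fewshot.sort_values("BestScore", ascending=False).head())
```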
data/dfcdata_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,82.39,82.39,90.14,90.14,85.21,85.21,86.62,86.62
+Yi-34B-Chat,86.62,86.62,76.06,85.21,78.87,80.28,85.92,86.62
+DevOps-Model-14B-Chat,80.99,78.87,51.41,63.38,33.80,34.51,54.23,56.34
+LLaMA-2-7B,45.07,45.07,61.97,61.97,30.28,30.28,45.77,45.77
+Qwen-72B-Chat,83.80,83.80,83.80,83.80,86.62,86.62,83.80,83.80
+GPT-3.5-turbo,71.13,73.24,80.28,78.87,77.46,76.06,82.39,81.69
+ERNIE-Bot-4.0,83.00,83.00,85.00,85.00,81.00,81.00,82.00,82.00
+Mistral-7B,16.90,16.90,64.08,64.08,2.82,2.82,64.79,64.79
+LLaMA-2-13B,61.97,61.97,61.27,61.27,45.77,45.77,70.42,70.42
+Baichuan2-13B-Chat,62.68,64.08,68.31,66.20,64.79,66.20,68.31,73.24
+Qwen-14B-Chat,76.06,74.65,69.01,71.83,73.94,73.94,73.24,76.76
+LLaMA-2-70B-Chat,41.55,40.85,72.54,72.54,14.79,14.79,67.61,67.61
+ChatGLM3-6B,51.4084507,51.4084507,57.04225352,57.04225352,55.63380282,55.63380282,61.97183099,61.97183099
+InternLM2-Chat-20B,78.16901408,78.16901408,,,74.64788732,74.64788732,74.64788732,74.64788732
+InternLM2-Chat-7B,74.64788732,74.64788732,57.04225352,57.04225352,76.05633803,76.05633803,73.94366197,73.94366197
+gemma_2b,27.46479,27.46479,41.5493,41.5493 ,28.16901,28.16901,38.02817,38.02817
+gemma_7b,50.70423,50.70423,66.90141,66.90141 ,35.91549,35.91549,59.15493,59.15493
+qwen1.5-14b-base,81.69014,81.69014,57.04225,57.04225 ,73.23944,73.23944,76.05634,76.05634
+qwen1.5-14b-chat,83.80282,80.98592,78.87324,80.98592,75.35211,76.05634,80.28169,83.09859
+
data/gtja_zh_mc.csv
ADDED
@@ -0,0 +1,20 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,70.33,70.33,71.43,71.43,68.13,68.13,67.03,67.03
+Yi-34B-Chat,69.23,70.33,49.45,47.25,71.43,74.73,71.43,73.63
+DevOps-Model-14B-Chat,61.54,59.34,52.75,63.74,41.76,38.46,45.05,49.45
+LLaMA-2-7B,42.86,42.86,45.05,45.05,28.57,28.57,45.05,45.05
+Qwen-72B-Chat,70.33,70.33,74.73,74.73,71.43,71.43,67.03,67.03
+GPT-3.5-turbo,47.25,52.75,57.14,58.24,49.45,52.75,59.34,62.64
+ERNIE-Bot-4.0,65.93,65.93,68.13,68.13,68.13,68.13,64.84,64.84
+Mistral-7B,14.29,14.29,38.46,38.46,5.49,5.49,47.25,47.25
+LLaMA-2-13B,47.25,47.25,42.86,42.86,30.77,30.77,47.25,47.25
+Baichuan2-13B-Chat,38.46,38.46,49.45,51.65,41.76,41.76,53.85,60.44
+Qwen-14B-Chat,54.95,54.95,59.34,61.54,47.25,47.25,53.85,54.95
+LLaMA-2-70B-Chat,19.78,19.78,49.45,49.45,6.59,6.59,48.35,48.35
+ChatGLM3-6B,43.95604396,43.95604396,47.25274725,47.25274725,43.95604396,43.95604396,53.84615385,53.84615385
+InternLM2-Chat-20B,65.93406593,65.93406593,,,56.04395604,56.04395604,,
+InternLM2-Chat-7B,54.94505495,54.94505495,51.64835165,51.64835165,56.04395604,56.04395604,59.34065934,59.34065934
+gemma_2b,32.96703,32.96703,29.67033,29.67033,30.76923,30.76923,43.95604,43.95604
+gemma_7b,34.06593,34.06593,50.54945,50.54945,29.67033,29.67033,56.04396,56.04396
+qwen1.5-14b-base,68.13187,68.13187,42.85714,42.85714,53.84615,53.84615,63.73626,63.73626
+qwen1.5-14b-chat,59.34066,57.14286,60.43956,62.63736,56.04396,54.94505,67.03297,68.13187
data/huaweicloud_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,43.33,43.33,46.67,46.67,20.00,20.00,20.00,20.00
+Yi-34B-Chat,50.00,46.67,30.00,43.33,36.67,40.00,36.67,30.00
+DevOps-Model-14B-Chat,40.00,40.00,20.00,23.33,16.67,16.67,33.33,13.33
+LLaMA-2-7B,16.67,16.67,33.33,33.33,10.00,10.00,26.67,26.67
+Qwen-72B-Chat,43.33,43.33,33.33,36.67,36.67,36.67,33.33,33.33
+GPT-3.5-turbo,20.00,20.00,16.67,23.33,13.33,13.33,20.00,26.67
+ERNIE-Bot-4.0,36.67,36.67,23.33,23.33,16.67,16.67,20.00,20.00
+Mistral-7B,0.00,0.00,16.67,16.67,0.00,0.00,23.33,23.33
+LLaMA-2-13B,26.67,26.67,13.33,13.33,10.00,10.00,20.00,20.00
+Baichuan2-13B-Chat,16.67,20.00,26.67,30.00,6.67,10.00,23.33,23.33
+Qwen-14B-Chat,40.00,30.00,26.67,33.33,13.33,13.33,20.00,26.67
+LLaMA-2-70B-Chat,23.33,23.33,16.67,16.67,3.33,3.33,20.00,20.00
+ChatGLM3-6B,6.666666667,6.666666667,13.33333333,13.33333333,13.33333333,13.33333333,16.66666667,16.66666667
+InternLM2-Chat-20B,16.66666667,16.66666667,,,13.33333333,13.33333333,20,20
+InternLM2-Chat-7B,30,30,40,40,43.33333333,43.33333333,23.33333333,23.33333333
+gemma_2b,26.66667,26.66667,20,20 ,26.66667,26.66667,10,10
+gemma_7b,13.33333,13.33333,30,30 ,3.333333,3.333333,23.33333,23.33333
+qwen1.5-14b-base,20,20,30,30 ,20,20,33.33333,33.33333
+qwen1.5-14b-chat,26.66667,26.66667,20,30,23.33333,26.66667,13.33333,13.33333
+
data/lenovo_en_mc.csv
ADDED
@@ -0,0 +1,9 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+ChatGLM3-6B,60,60,60,60,55,55,60,60
+InternLM2-Chat-20B,62.5,62.5,,,75,75,,
+InternLM2-Chat-7B,65,65,67.5,67.5,75,75,57.5,57.5
+gemma_2b,22.5,22.5,47.5,47.5 ,30,30,37.5,37.5
+gemma_7b,32.5,32.5,65,65 ,35,35,65,65
+qwen1.5-14b-base,67.5,67.5,70,70 ,72.5,72.5,50,50
+qwen1.5-14b-chat,67.5,67.5,70,70,72.5,65,77.5,77.5
+
data/lenovo_zh_mc.csv
ADDED
@@ -0,0 +1,15 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,77.50,77.50,82.50,82.50,77.50,77.50,82.50,82.50
+Yi-34B-Chat,62.50,57.50,52.50,52.50,75.00,75.00,87.50,82.50
+DevOps-Model-14B-Chat,67.50,70.00,62.50,70.00,60.00,67.50,65.00,57.50
+LLaMA-2-7B,60.00,60.00,55.00,55.00,32.50,32.50,45.00,45.00
+Qwen-72B-Chat,75.00,75.00,75.00,75.00,72.50,72.50,75.00,75.00
+GPT-3.5-turbo,57.50,57.50,62.50,62.50,60.00,62.50,65.00,70.00
+ERNIE-Bot-4.0,75.00,75.00,82.50,82.50,75.00,75.00,77.50,77.50
+Mistral-7B,35.00,35.00,60.00,60.00,47.50,47.50,62.50,62.50
+LLaMA-2-13B,60.00,60.00,55.00,55.00,45.00,45.00,62.50,62.50
+ChatGLM3-6B,55.00,55.00,60.00,60.00,60.00,60.00,60.00,60.00
+Baichuan2-13B-Chat,62.50,60.00,70.00,67.50,65.00,60.00,72.50,67.50
+Qwen-14B-Chat,70.00,65.00,65.00,67.50,70.00,67.50,70.00,67.50
+LLaMA-2-70B-Chat,20.00,20.00,57.50,57.50,22.50,22.50,75.00,75.00
+
data/network_en_mc.csv
ADDED
@@ -0,0 +1,28 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,18.3,20.4,28.6,37,24.1,26.7,18.2,17.8
+Chinese-Alpaca-2-13B,37.7,37.7,49.7,49.7,48.6,48.6,50.5,50.5
+GPT-3.5-turbo,66.6,66.8,69.6,72,68.3,68.3,70.9,72.5
+LLaMA-2-13B,41.8,46.5,53.1,58.7,53.3,53,56.8,61
+Qwen-7B-Chat,45.9,46,47.3,50.1,52.1,51,48.3,49.8
+ChatGLM2-6B,24.8,24.7,36.6,36.5,37.6,37.6,40.5,40.5
+Chinese-LLaMA-2-13B,29.4,29.4,37.8,37.8,40.4,40.4,28.8,28.8
+InternLM-7B,38.7,38.7,43.9,43.9,45.2,45.2,51.4,51.4
+LLaMA-2-7B,39.5,40,45.4,49.5,48.2,46.8,52,55.2
+Baichuan2-13B-Chat,14.1,15.3,24.1,25.8,32.3,33.1,25.6,27.7
+GPT-4,/,/,/,/,/,/,88.7,88.7
+AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL
+Yi-34B-Chat,57.75,59.14,65.11,68.79,68.16,68.37,78.09,80.06
+DevOps-Model-14B-Chat,30.69,30.59,55.77,63.63,63.85,61.96,41.15,44.01
+Qwen-72B-Chat,70.41,70.50,72.38,72.56,70.32,70.32,70.13,70.22
+Mistral-7B,29.27,29.27,46.30,46.30,47.22,47.22,45.58,45.58
+Qwen-14B-Chat,43.78,47.81,56.58,59.40,62.09,59.70,49.06,55.88
+LLaMA-2-70B-Chat,25.29,25.29,57.97,58.06,52.97,52.97,58.55,58.55
+ERNIE-Bot-4.0,61.15,61.15,70.00,70.00,60.00,60.00,70.00,70.00
+ChatGLM3-6B,43.38487973,43.38487973,44.58762887,44.58762887,42.09621993,42.09621993,43.47079038,43.47079038
+InternLM2-Chat-20B,56.35738832,56.35738832,26.18025751,26.18025751,60.48109966,60.48109966,45.10309278,45.10309278
+InternLM2-Chat-7B,49.74226804,49.74226804,56.18556701,56.18556701,48.19587629,48.19587629,49.74226804,49.74226804
+gemma_2b,26.46048,26.46048,33.41924,33.41924 ,26.6323,26.6323,37.54296,37.54296
+gemma_7b,25.08591,25.08591,50.85911,50.85911 ,30.24055,30.24055,51.55747,51.55747
+qwen1.5-14b-base,34.87973,34.87973,60.82474,60.82474 ,65.54983,65.54983,47.07904,47.07904
+qwen1.5-14b-chat,54.89691,56.4433,64.08935,67.09622,52.23368,53.52234,59.53608,64.17526
+
data/network_en_qa.csv
ADDED
@@ -0,0 +1,11 @@
+name,rouge1,rouge2,rouge_l,rouge_lsum,score,bp,sys_len,gpt4_score
+GPT-3.5-turbo,13.38,5.65,12.13,12.26,6.78,1,2966,8.47
+LLaMA-2-70B,8.69,2.51,7.62,7.74,4.2,1,4970,7.28
+LLaMA-2-13B,5.75,1.68,5.03,4.98,3.43,1,8239,7.16
+Chinese-Alpaca-2-13B,3.48,0.96,3.19,3.25,1.85,1,14716,6.66
+Baichuan-13B-Chat,5.58,1.85,4.66,4.76,0.35,1,9577,5.85
+Qwen-7B-Chat,13.03,4.76,11.61,11.82,4.33,1,3091,5.63
+ChatGLM2-6B,10.43,3.24,9.82,9.71,5.07,0.91,2492,4.88
+InternLM-7B,14.34,5.39,13.3,13.27,0.54,1,3112,4.52
+Chinese-LLaMA-2-13B,9.18,2.9,9.22,9.19,0.24,1,32006,2.39
+
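The network_*_qa.csv files use a different schema: per-model ROUGE scores, what looks like a sacreBLEU-style score/bp/sys_len triple, and a GPT-4 judge score (tencent_zh_qa.csv keeps only gpt4_score, while zjyd_zh_qa.csv reuses the multiple-choice columns). app.py renders these frames unchanged via dataframe_to_gradio(df, is_mc=False); a quick pandas sketch for inspecting one of them, sorted by the judge score:

```python
import pandas as pd

# Question-answering results are displayed as-is; here we just rank models by the GPT-4 judge score.
qa = pd.read_csv("./data/network_en_qa.csv")
print(qa.sort_values("gpt4_score", ascending=False)[["name", "rouge_l", "bp", "gpt4_score"]])
```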
data/network_zh_mc.csv
ADDED
@@ -0,0 +1,31 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,15.2,16,43.9,49.7,34.3,36.1,51.3,55.6
+Chinese-Alpaca-2-13B,33.1,33.1,44.2,44.2,44,44,42.7,42.7
+GPT-3.5-turbo,58.4,58.6,64.8,67.6,59.2,59.7,65.2,67.4
+LLaMA-2-13B,29.7,31.6,51.6,57,39.6,38.9,48,50.6
+Qwen-7B-Chat,29.6,29.9,50.6,53.5,50.4,46.9,46.9,47.7
+ChatGLM2-6B,33.8,33.7,42.1,42.2,36,36,39.5,39.5
+Chinese-LLaMA-2-13B,22.5,22.5,38.8,38.8,41.8,41.8,32.2,32.2
+InternLM-7B,41.7,41.7,38.4,38.4,42.6,42.6,41.3,41.3
+LLaMA-2-7B,29.8,30.2,50.1,55.6,38.6,40.8,45.6,50.4
+Baichuan2-13B-Chat,35.6,35.9,30.5,30.5,34.6,35.6,30.2,32.0
+GPT-4,/,/,/,/,/,/,86,86
+AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL
+Yi-34B-Chat,61.61,62.56,68.11,69.75,65.73,65.37,69.88,71.21
+DevOps-Model-14B-Chat,47.59,46.57,52.52,56.01,62.07,60.08,50.59,55.79
+Qwen-72B-Chat,65.77,65.86,68.13,68.30,69.40,69.40,69.99,70.08
+Mistral-7B,1.90,1.90,45.61,45.61,15.00,15.00,35.97,35.97
+Qwen-14B-Chat,48.35,48.81,55.35,57.40,58.53,56.12,52.12,54.99
+LLaMA-2-70B-Chat,38.55,38.55,57.49,57.49,49.09,49.09,48.57,48.57
+ERNIE-Bot-4.0,67.54,67.54,71.96,71.96,72.00,72.00,78.00,78.00
+Hunyuan-13B,60.00,60.00,70.00,70.00,,,,
+ChatGLM3-6B,41.39414802,41.39414802,49.22547332,49.22547332,38.81239243,38.81239243,42.85714286,42.85714286
+InternLM2-Chat-20B,57.48709122,57.48709122,57.14285714,57.14285714,59.1222031,59.1222031,50.77452668,50.77452668
+InternLM2-Chat-7B,54.30292599,54.30292599,59.81067126,59.81067126,58.51979346,58.51979346,51.63511188,51.63511188
+GLM3-turbo,59.63855422,59.63855422,,,,,,
+GLM4,67.383821,67.383821,,,,,,
+gemma_2b,29.69019,29.69019,39.15663,39.15663 ,29.77625,29.77625,38.64028,38.64028
+gemma_7b,31.58348,31.58348,47.59036,47.59036 ,34.68158,34.68158,48.88124,48.88124
+qwen1.5-14b-base,45.18072,45.18072,59.1222,59.1222 ,61.10155,61.10155,52.4957,52.4957
+qwen1.5-14b-chat,54.04475,53.87263,62.56454,63.85542,58.77797,58.0895,63.42513,65.57659
+
data/network_zh_qa.csv
ADDED
@@ -0,0 +1,10 @@
+name,rouge1,rouge2,rouge_l,rouge_lsum,score,bp,sys_len,gpt4_score
+GPT-3.5-turbo,17.28,6.39,16.84,16.87,1.89,0.74,368,6.98
+ChatGLM2-6B,6.92,1.97,6.83,6.75,0.11,1,1867,4.46
+InternLM-7B,2.76,1.03,2.76,2.76,0.01,1,6053,2.22
+Baichuan-13B-Chat,9.09,3.67,9.04,9.2,0.53,1,1125,5.14
+LLaMA-2-13B,4.29,1.29,4.2,4.22,0.23,1,1581,5.03
+Chinese-LLaMA-2-13B,4.96,4.11,4.7,4.73,0.01,1,11371,1.77
+Chinese-Alpaca-2-13B,10.03,2.19,9.86,9.97,0.02,1,2605,4.71
+Qwen-7B-Chat,10,2.45,9.94,10.05,0.23,0.42,257,5.07
+
data/oracle_en_mc.csv
ADDED
@@ -0,0 +1,28 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,12.47,11.67,16.50,19.52,24.55,22.54,26.36,28.77
+Chinese-Alpaca-2-13B,23.14,23.14,28.97,28.97,16.30,16.30,14.29,14.29
+GPT-3.5-turbo,38.63,38.83,40.04,42.05,36.62,37.63,42.66,43.86
+LLaMA-2-13B,16.10,20.32,23.94,29.58,20.12,22.33,24.35,33.80
+Qwen-7B-Chat,18.91,19.11,22.13,23.94,26.76,25.55,34.81,33.40
+ChatGLM2-6B,20.72,20.52,19.92,19.72,20.12,20.12,22.94,22.74
+Chinese-LLaMA-2-13B,13.88,13.88,20.52,20.52,16.90,16.90,23.34,23.34
+InternLM-7B,26.36,26.36,25.55,25.55,25.55,25.55,27.97,27.97
+LLaMA-2-7B,22.13,23.74,23.74,26.56,19.32,20.52,28.77,33.60
+Baichuan2-13B-Chat,17.1,19.1,18.7,22.9,25.9,26.5,20.9,24.5
+GPT-4,/,/,59.02,64.56,/,/,58.35,62.58
+AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL
+Yi-34B-Chat,47.08,48.69,47.08,46.28,58.15,58.35,56.94,58.95
+DevOps-Model-14B-Chat,25.15,26.96,35.41,38.83,33.20,34.81,27.36,27.36
+Qwen-72B-Chat,47.28,47.48,48.09,48.09,49.70,49.70,43.46,43.66
+ERNIE-Bot-4.0,43.80,43.80,47.14,47.14,46.00,46.00,54.00,54.00
+Mistral-7B,17.10,17.10,26.76,26.76,31.19,31.19,27.97,27.97
+Qwen-14B-Chat,24.95,28.37,33.00,36.62,27.97,28.37,27.97,24.14
+LLaMA-2-70B-Chat,19.72,19.72,27.97,27.97,26.56,26.56,32.60,32.60
+ChatGLM3-6B,20.92555332,20.92555332,25.15090543,25.15090543,24.74849095,24.74849095,29.1750503,29.1750503
+InternLM2-Chat-20B,,,59.21052632,59.21052632,,,,
+InternLM2-Chat-7B,27.16297787,27.16297787,28.16901408,28.16901408,29.97987928,29.97987928,30.18108652,30.18108652
+gemma_2b,16.90141,16.90141,19.5171,19.5171 ,16.09658,16.09658,24.74849,24.74849
+gemma_7b,14.28571,14.28571,30.98592,30.98592 ,2.60223,2.60223,43.85965,43.85965
+qwen1.5-14b-base,29.17505,29.17505,33.60161,33.60161 ,36.82093,36.82093,27.7666,27.7666
+qwen1.5-14b-chat,32.79678,35.41247,39.43662,43.05835,32.39437,33.60161,36.82093,38.833
+
data/oracle_zh_mc.csv
ADDED
@@ -0,0 +1,28 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,12.88,12.07,25.96,27.57,18.91,19.52,27.97,30.58
+Chinese-Alpaca-2-13B,22.94,22.94,25.75,25.75,25.15,25.15,22.33,22.33
+GPT-3.5-turbo,36.42,35.81,39.24,43.26,39.84,39.44,27.16,27.77
+LLaMA-2-13B,23.94,24.35,29.58,31.99,24.55,26.76,21.13,20.72
+Qwen-7B-Chat,18.51,17.71,27.36,28.37,29.78,29.58,33.60,31.79
+ChatGLM2-6B,23.34,23.34,24.35,24.14,22.94,22.94,26.16,26.16
+Chinese-LLaMA-2-13B,14.69,14.69,19.92,19.92,19.72,19.72,20.93,20.93
+InternLM-7B,25.96,25.96,25.96,25.96,29.18,29.18,28.37,28.37
+LLaMA-2-7B,20.72,20.72,27.16,27.97,21.53,18.51,18.31,17.91
+Baichuan2-13B-Chat,25.7,25.5,20.1,21.3,27.7,26.7,22.7,24.7
+GPT-4,/,/,59.38,65.17,/,/,44.06,48.09
+AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL
+Mistral-7B,1.90,1.90,45.61,45.61,15.00,15.00,35.97,35.97
+Yi-34B-Chat,49.90,49.30,52.72,53.72,56.34,56.34,51.31,54.33
+DevOps-Model-14B-Chat,24.75,22.74,28.37,27.77,36.62,37.02,27.57,26.36
+Qwen-72B-Chat,48.29,48.49,49.50,49.70,49.70,49.70,45.27,44.87
+ERNIE-Bot-4.0,48.56,48.56,50.64,50.64,48.00,48.00,54.00,54.00
+Mistral-7B,0.20,0.20,26.76,26.76,10.26,10.26,32.19,32.19
+Qwen-14B-Chat,27.57,27.57,32.39,36.02,40.04,35.41,30.38,33.40
+LLaMA-2-70B-Chat,15.29,15.29,34.81,34.81,26.76,26.76,33.80,33.80
+ChatGLM3-6B,21.32796781,21.32796781,28.97384306,28.97384306,21.73038229,21.73038229,29.57746479,29.57746479
+InternLM2-Chat-7B,28.57142857,28.57142857,31.79074447,31.79074447,30.78470825,30.78470825,31.18712274,31.18712274
+gemma_2b,18.51107,18.51107,24.9497,24.9497 ,21.52918,21.52918,27.7666,27.7666
+gemma_7b,19.3159,19.3159,53.94737,53.94737 ,18.51107,18.51107,5.204461,5.204461
+qwen1.5-14b-base,20.92555,20.92555,35.61368,35.61368 ,41.44869,41.44869,30.78471,30.78471
+qwen1.5-14b-chat,24.14487,23.34004,40.64386,41.04628,38.22938,38.02817,39.43662,40.04024
+
data/pufa_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,84.00,84.00,90.67,90.67,88.00,88.00,86.67,86.67
+Yi-34B-Chat,90.67,92.00,78.67,89.33,84.00,84.00,88.00,88.00
+DevOps-Model-14B-Chat,82.67,81.33,53.33,70.67,29.33,29.33,62.67,61.33
+LLaMA-2-7B,48.00,48.00,52.00,52.00,25.33,25.33,40.00,40.00
+Qwen-72B-Chat,88.00,88.00,82.67,82.67,90.67,90.67,85.33,85.33
+GPT-3.5-turbo,76.00,78.67,84.00,82.67,77.33,77.33,84.00,81.33
+ERNIE-Bot-4.0,82.67,82.67,86.67,86.67,86.67,86.67,86.67,86.67
+Mistral-7B,22.67,22.67,54.67,54.67,4.00,4.00,58.67,58.67
+LLaMA-2-13B,61.33,61.33,53.33,53.33,44.00,44.00,68.00,68.00
+Baichuan2-13B-Chat,62.67,61.33,62.67,62.67,65.33,66.67,66.67,66.67
+Qwen-14B-Chat,73.33,73.33,72.00,80.00,73.33,73.33,69.33,72.00
+LLaMA-2-70B-Chat,49.33,49.33,66.67,66.67,6.67,6.67,65.33,65.33
+ChatGLM3-6B,56,56,58.66666667,58.66666667,60,60,61.33333333,61.33333333
+InternLM2-Chat-20B,80,80,,,76,76,80,80
+InternLM2-Chat-7B,72,72,53.33333333,53.33333333,78.66666667,78.66666667,72,72
+gemma_2b,36,36,30.66667,30.66667,36,36,41.33333,41.33333
+gemma_7b,46.66667,46.66667,56,56,34.66667,34.66667,56,56
+qwen1.5-14b-base,92,92,42.66667,42.66667,78.66667,78.66667,72,72
+qwen1.5-14b-chat,78.66667,80,86.66667,85.33333,86.66667,89.33333,85.33333,85.33333
+
data/rzy_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,65.28,65.28,68.19,68.19,65.56,65.56,68.05,68.05
+Yi-34B-Chat,60.17,60.03,57.68,57.54,64.45,64.59,67.77,67.36
+DevOps-Model-14B-Chat,65.28,64.18,55.19,61.83,53.67,56.85,54.50,59.20
+LLaMA-2-7B,46.20,46.20,53.39,53.39,34.85,34.85,44.95,44.95
+Qwen-72B-Chat,65.98,65.98,70.12,70.12,66.67,66.67,65.28,65.28
+GPT-3.5-turbo,65.28,66.25,68.05,68.74,65.28,65.42,66.39,67.50
+ERNIE-Bot-4.0,73.00,73.00,77.00,77.00,76.00,76.00,79.00,79.00
+Mistral-7B,29.88,29.88,59.75,59.75,18.53,18.53,60.30,60.30
+LLaMA-2-13B,57.12,57.12,53.39,53.39,51.18,51.18,59.06,59.06
+Baichuan2-13B-Chat,59.06,59.34,64.45,64.32,60.17,60.17,62.79,67.50
+Qwen-14B-Chat,65.28,63.49,62.93,65.98,61.96,61.55,61.55,64.45
+LLaMA-2-70B-Chat,48.82,48.82,59.75,59.75,5.26,5.26,62.52,62.52
+ChatGLM3-6B,55.32503458,55.32503458,59.33609959,59.33609959,54.21853389,54.21853389,62.10235131,62.10235131
+InternLM2-Chat-20B,,,,,63.90041494,63.90041494,64.03872752,64.03872752
+InternLM2-Chat-7B,65.00691563,65.00691563,54.21853389,54.21853389,61.2724758,61.2724758,63.62378976,63.62378976
+gemma_2b,33.60996,33.60996,37.75934,37.75934 ,36.37621,36.37621,45.22822,45.22822
+gemma_7b,42.04703,42.04703,56.70816,56.70816 ,39.41909,39.41909,54.77178,54.77178
+qwen1.5-14b-base,65.42185,65.42185,50.89903,50.89903 ,51.17566,51.17566,62.6556,62.6556
+qwen1.5-14b-chat,63.34716,63.7621,65.42185,65.9751,62.93223,64.03873,64.59198,64.31535
+
data/tencent_zh_qa.csv
ADDED
@@ -0,0 +1,14 @@
+name,gpt4_score
+Baichuan2-13B-Chat,8.727272727272727
+DevOps-Model-14B-Chat,8.25974026
+LLaMA-2-13B,7.636363636363637
+LLaMA-2-70B-Chat,7.740259740259741
+Mistral-7B,7.8441558441558445
+Qwen-14B-Chat,8.642857142857142
+Qwen-72B-Chat,8.811688311688311
+GPT4,9.019480519480519
+Yi-34B-Chat,8.844155844155845
+ChatGLM3-6B,8.577922077922079
+LLaMA-2-7B,5.318181818181818
+GPT-3.5-turbo,8.850649351
+
data/zabbix_zh_mc.csv
ADDED
@@ -0,0 +1,21 @@
+name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
+GPT4,60.00,60.00,59.00,59.00,51.00,51.00,53.00,53.00
+Yi-34B-Chat,42.00,42.00,42.00,42.00,40.00,40.00,40.00,40.00
+DevOps-Model-14B-Chat,46.00,44.00,44.00,46.00,27.00,28.00,36.00,33.00
+LLaMA-2-7B,22.00,22.00,28.00,28.00,18.00,18.00,35.00,35.00
+Qwen-72B-Chat,45.00,45.00,61.00,61.00,46.00,46.00,44.00,44.00
+GPT-3.5-turbo,40.00,40.00,48.00,48.00,36.00,36.00,42.00,42.00
+ERNIE-Bot-4.0,47.00,47.00,51.00,51.00,44.00,44.00,48.00,48.00
+Mistral-7B,11.00,11.00,44.00,44.00,6.00,6.00,42.00,42.00
+LLaMA-2-13B,40.00,40.00,43.00,43.00,28.00,28.00,45.00,45.00
+Baichuan2-13B-Chat,29.00,27.00,40.00,43.00,31.00,29.00,47.00,47.00
+Qwen-14B-Chat,44.00,40.00,47.00,43.00,36.00,36.00,39.00,41.00
+LLaMA-2-70B-Chat,29.00,29.00,46.00,46.00,1.00,1.00,47.00,47.00
+ChatGLM3-6B,29,29,34,34,29,29,36,36
+InternLM2-Chat-20B,44,44,,,41,41,,
+InternLM2-Chat-7B,45,45,35,35,43,43,39,39
+gemma_2b,24,24,30,30,25,25,32,32
+gemma_7b,28,28,40,40 ,22,22,44,44
+qwen1.5-14b-base,48,48,36,36,38,38,39,39
+qwen1.5-14b-chat,42,39,48,49,34,34,45,43
+
data/zjyd_zh_mc.csv
ADDED
@@ -0,0 +1,28 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,11.04,11.13,26.92,28.61,14.35,13.22,31.69,33.97
+Chinese-Alpaca-2-13B,22.69,22.69,24.59,24.59,40.52,40.52,40.73,40.73
+GPT-3.5-turbo,37.06,36.83,37.56,39.25,39.42,39.77,41.96,42.15
+LLaMA-2-13B,25.43,27.16,29.17,29.99,36.56,36.15,37.70,39.02
+Qwen-7B-Chat,36.28,36.50,33.18,33.51,41.58,40.59,31.48,31.46
+ChatGLM2-6B,23.09,23.12,24.22,24.08,30.46,30.46,35.97,35.90
+Chinese-LLaMA-2-13B,17.98,17.98,17.83,17.83,31.66,31.66,36.24,36.24
+InternLM-7B,27.81,27.81,19.95,19.95,24.18,24.18,35.35,35.35
+LLaMA-2-7B,24.09,23.47,28.69,29.26,29.94,30.03,31.35,31.93
+GPT-4,/,/,57.35,62.11,/,/,61.20,65.68
+Yi-34B-Chat,64.91,64.58,62.77,65.51,70.85,70.92,48.77,47.97
+DevOps-Model-14B-Chat,41.04,42.70,48.71,53.57,56.85,57.25,51.30,54.29
+Qwen-72B-Chat,64.79,64.79,65.79,65.72,70.19,70.19,68.31,68.38
+ERNIE-Bot-4.0,45.99,45.99,48.98,48.98,46.00,46.00,54.00,54.00
+Mistral-7B,1.27,1.27,42.05,42.05,30.72,30.72,46.44,46.44
+Qwen-14B-Chat,41.71,41.44,45.58,47.98,53.52,49.92,54.72,58.85
+LLaMA-2-70B-Chat,24.38,24.38,43.63,43.63,44.65,44.65,48.84,48.84
+ChatGLM3-6B,32.6,32.6,35.4,35.4,28.3,28.3,40.9,40.9
+InternLM2-Chat-20B,44.6,44.6,47,47,62.2,62.2,38.3,38.3
+InternLM2-Chat-7B,38.8,38.8,44.6,44.6,46,46,35.8,35.8
+GLM3-turbo,43,43,,,,,,
+GLM4,50,50,,,,,,
+gemma_2b,25.6,25.6,28.3,28.3 ,19.1,19.1,35.5,35.5
+gemma_7b,27.3,27.3,35.4,35.4 ,17.3,17.3,44.5,44.5
+qwen1.5-14b-base,49.1,49.1,49.9,49.9 ,62.5,62.5,41.3,41.3
+qwen1.5-14b-chat,38.6,38.9,48.8,50.5,54.6,55.2,52.1,52.7
+
data/zjyd_zh_qa.csv
ADDED
@@ -0,0 +1,11 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+GPT-3.5-turbo,79.48718,81.19658,88.39286,89.28571 ,79.48717949,81.1965812,86.60714286,88.39285714
+Gemma-2B,26.49573,26.49573,62.5,62.5 ,58.97435897,58.97435897,75,75
+Gemma-7B,77.77778,77.77778,75.89286,75.89286 ,76.06837607,76.06837607,86.60714286,86.60714286
+Qwen1.5-0.5B-Base,65.17857,65.17857,75,75 ,56.25,56.25,57.14285714,57.14285714
+Qwen1.5-0.5B-Chat,0,0,54.46429,53.57143 ,20.53571429,19.64285714,16.96428571,17.85714286
+Qwen1.5-1.8B-Base,71.42857,71.42857,71.42857,71.42857 ,70.53571429,70.53571429,80.35714286,80.35714286
+Qwen1.5-1.8B-Chat,73.21429,69.64286,67.85714,77.67857 ,66.07142857,66.07142857,68.75,75
+Qwen1.5-14B-Base,76.92308,76.92308,88.39286,88.39286 ,78.63247863,78.63247863,83.03571429,83.03571429
+Qwen1.5-14B-Chat,18.75,23.21429,91.07143,92.85714 ,79.46428571,78.57142857,80.35714286,83.92857143
+
data/zte_en_mc.csv
ADDED
@@ -0,0 +1,26 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Baichuan-13B-Chat,11.60,14.31,14.68,18.46,14.56,15.68,16.21,16.82
+Chinese-Alpaca-2-13B,20.86,20.86,23.08,23.08,29.75,29.75,32.83,32.83
+GPT-3.5-turbo,35.04,34.82,38.46,43.50,39.29,39.19,41.01,42.58
+LLaMA-2-13B,15.62,18.32,29.88,34.45,23.16,29.14,37.59,44.30
+Qwen-7B-Chat,33.37,33.74,32.97,34.10,32.98,32.70,36.60,36.65
+ChatGLM2-6B,15.94,16.06,19.83,19.91,26.27,26.22,28.25,28.37
+Chinese-LLaMA-2-13B,10.02,10.02,19.51,19.51,34.51,34.51,33.34,33.34
+InternLM-7B,20.48,20.48,23.85,23.85,23.69,23.69,26.06,26.06
+LLaMA-2-7B,19.42,21.62,25.46,27.11,21.45,24.85,33.60,34.83
+GPT-4,/,/,56.90,65.49,/,/,59.39,63.54
+Yi-34B-Chat,38.24,37.04,48.24,52.10,61.33,61.19,53.53,53.39
+DevOps-Model-14B-Chat,31.04,30.51,42.84,47.37,52.25,49.38,45.90,47.23
+Qwen-72B-Chat,53.19,53.19,55.25,55.52,58.13,58.13,58.72,58.99
+ERNIE-Bot-4.0,43.66,43.66,51.99,51.99,44.00,44.00,50.00,50.00
+Mistral-7B,26.91,26.91,30.65,30.65,40.52,40.52,46.84,46.84
+Qwen-14B-Chat,33.71,36.25,41.24,42.51,51.19,50.39,57.18,59.18
+LLaMA-2-70B-Chat,23.64,23.64,39.31,39.31,38.98,39.12,47.90,47.90
+ChatGLM3-6B,30.4,30.4,30.7,30.7,26.9,26.9,37.2,37.2
+InternLM2-Chat-20B,39.1,39.1,37.7,37.7,47.7,47.7,33.5,33.5
+InternLM2-Chat-7B,36.8,36.8,31.7,31.7,46.3,46.3,36.9,36.9
+gemma_2b,20.1,20.1,24.2,24.2 ,31.2,31.2,35.5,35.5
+gemma_7b,23.1,23.1,34.4,34.4 ,21.4,21.4,33.1,33.1
+qwen1.5-14b-base,34,34,42.8,42.8 ,57.9,57.9,40.2,40.2
+qwen1.5-14b-chat,34.5,35.6,41.7,41.1,33.2,34.7,46.2,47.4
+
data/zte_zh_mc.csv
ADDED
@@ -0,0 +1,28 @@
+name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
+Yi-34B-Chat,64.91,64.58,62.77,65.51,70.85,70.92,48.77,47.97
+Qwen-72B-Chat,64.79,64.79,65.79,65.72,70.19,70.19,68.31,68.38
+GPT-4,/,/,57.35,62.11,/,/,61.2,65.68
+qwen1.5-14b-base,49.1,49.1,49.9,49.9,62.5,62.5,41.3,41.3
+InternLM2-Chat-20B,44.6,44.6,47,47,62.2,62.2,38.3,38.3
+Qwen-14B-Chat,41.71,41.44,45.58,47.98,53.52,49.92,54.72,58.85
+DevOps-Model-14B-Chat,41.04,42.7,48.71,53.57,56.85,57.25,51.3,54.29
+qwen1.5-14b-chat,38.6,38.9,48.8,50.5,54.6,55.2,52.1,52.7
+ERNIE-Bot-4.0,45.99,45.99,48.98,48.98,46,46,54,54
+GLM4,50,50,,,,,,
+LLaMA-2-70B-Chat,24.38,24.38,43.63,43.63,44.65,44.65,48.84,48.84
+Mistral-7B,1.27,1.27,42.05,42.05,30.72,30.72,46.44,46.44
+InternLM2-Chat-7B,38.8,38.8,44.6,44.6,46,46,35.8,35.8
+gemma_7b,27.3,27.3,35.4,35.4,17.3,17.3,44.5,44.5
+GLM3-turbo,43,43,,,,,,
+GPT-3.5-turbo,37.06,36.83,37.56,39.25,39.42,39.77,41.96,42.15
+Qwen-7B-Chat,36.28,36.5,33.18,33.51,41.58,40.59,31.48,31.46
+ChatGLM3-6B,32.6,32.6,35.4,35.4,28.3,28.3,40.9,40.9
+Chinese-Alpaca-2-13B,22.69,22.69,24.59,24.59,40.52,40.52,40.73,40.73
+LLaMA-2-13B,25.43,27.16,29.17,29.99,36.56,36.15,37.7,39.02
+Chinese-LLaMA-2-13B,17.98,17.98,17.83,17.83,31.66,31.66,36.24,36.24
+ChatGLM2-6B,23.09,23.12,24.22,24.08,30.46,30.46,35.97,35.9
+gemma_2b,25.6,25.6,28.3,28.3,19.1,19.1,35.5,35.5
+InternLM-7B,27.81,27.81,19.95,19.95,24.18,24.18,35.35,35.35
+Baichuan-13B-Chat,11.04,11.13,26.92,28.61,14.35,13.22,31.69,33.97
+LLaMA-2-7B,24.09,23.47,28.69,29.26,29.94,30.03,31.35,31.93
+
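create_lang_leader_board() already collects a per-dataset BestScore series from these files and leaves the radar plot commented out. Below is a sketch of how those scores could feed a Plotly radar chart, mirroring plot_radar_chart from app.py; the dataset keys are an example subset, and the radial range is widened to 0-100 here because these CSVs store percentages (app.py's version fixes it at [0, 0.9]).

```python
import pandas as pd
import plotly.graph_objects as go

datasets = ["network", "oracle", "zte"]  # example subset of the *_zh_mc.csv files above

# Best score per model and dataset, the same quantity create_lang_leader_board() stores in best_scores.
best = {}
for ds in datasets:
    df = pd.read_csv(f"./data/{ds}_zh_mc.csv").rename(columns={"name": "Model"}).set_index("Model")
    best[ds] = df.apply(pd.to_numeric, errors="coerce").max(axis=1)

best_df = pd.DataFrame(best).dropna().reset_index()  # keep only models present in every dataset

# One polar trace per model, one axis per dataset (cf. the commented-out plot_radar_chart call).
fig = go.Figure()
for _, row in best_df.iterrows():
    fig.add_trace(go.Scatterpolar(r=row[datasets].tolist(), theta=datasets,
                                  fill="toself", name=row["Model"]))
fig.update_layout(title="OpsEval", showlegend=True,
                  polar=dict(radialaxis=dict(visible=True, range=[0, 100])))
fig.show()
```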
df_process.ipynb
ADDED
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import os, json"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 2,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [
|
18 |
+
{
|
19 |
+
"data": {
|
20 |
+
"text/html": [
|
21 |
+
"<div>\n",
|
22 |
+
"<style scoped>\n",
|
23 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
24 |
+
" vertical-align: middle;\n",
|
25 |
+
" }\n",
|
26 |
+
"\n",
|
27 |
+
" .dataframe tbody tr th {\n",
|
28 |
+
" vertical-align: top;\n",
|
29 |
+
" }\n",
|
30 |
+
"\n",
|
31 |
+
" .dataframe thead th {\n",
|
32 |
+
" text-align: right;\n",
|
33 |
+
" }\n",
|
34 |
+
"</style>\n",
|
35 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
36 |
+
" <thead>\n",
|
37 |
+
" <tr style=\"text-align: right;\">\n",
|
38 |
+
" <th></th>\n",
|
39 |
+
" <th>name</th>\n",
|
40 |
+
" <th>zero_native</th>\n",
|
41 |
+
" <th>zero_self_con</th>\n",
|
42 |
+
" <th>zero_cot</th>\n",
|
43 |
+
" <th>zero_cot_self_con</th>\n",
|
44 |
+
" <th>few_native</th>\n",
|
45 |
+
" <th>few_self_con</th>\n",
|
46 |
+
" <th>few_cot</th>\n",
|
47 |
+
" <th>few_cot_self_con</th>\n",
|
48 |
+
" </tr>\n",
|
49 |
+
" </thead>\n",
|
50 |
+
" <tbody>\n",
|
51 |
+
" <tr>\n",
|
52 |
+
" <th>0</th>\n",
|
53 |
+
" <td>Baichuan-13B-Chat</td>\n",
|
54 |
+
" <td>18.3</td>\n",
|
55 |
+
" <td>20.4</td>\n",
|
56 |
+
" <td>28.6</td>\n",
|
57 |
+
" <td>37</td>\n",
|
58 |
+
" <td>24.1</td>\n",
|
59 |
+
" <td>26.7</td>\n",
|
60 |
+
" <td>18.200000</td>\n",
|
61 |
+
" <td>17.800000</td>\n",
|
62 |
+
" </tr>\n",
|
63 |
+
" <tr>\n",
|
64 |
+
" <th>1</th>\n",
|
65 |
+
" <td>Chinese-Alpaca-2-13B</td>\n",
|
66 |
+
" <td>37.7</td>\n",
|
67 |
+
" <td>37.7</td>\n",
|
68 |
+
" <td>49.7</td>\n",
|
69 |
+
" <td>49.7</td>\n",
|
70 |
+
" <td>48.6</td>\n",
|
71 |
+
" <td>48.6</td>\n",
|
72 |
+
" <td>50.500000</td>\n",
|
73 |
+
" <td>50.500000</td>\n",
|
74 |
+
" </tr>\n",
|
75 |
+
" <tr>\n",
|
76 |
+
" <th>2</th>\n",
|
77 |
+
" <td>GPT-3.5-turbo</td>\n",
|
78 |
+
" <td>66.6</td>\n",
|
79 |
+
" <td>66.8</td>\n",
|
80 |
+
" <td>69.6</td>\n",
|
81 |
+
" <td>72</td>\n",
|
82 |
+
" <td>68.3</td>\n",
|
83 |
+
" <td>68.3</td>\n",
|
84 |
+
" <td>70.900000</td>\n",
|
85 |
+
" <td>72.500000</td>\n",
|
86 |
+
" </tr>\n",
|
87 |
+
" <tr>\n",
|
88 |
+
" <th>3</th>\n",
|
89 |
+
" <td>LLaMA-2-13B</td>\n",
|
90 |
+
" <td>41.8</td>\n",
|
91 |
+
" <td>46.5</td>\n",
|
92 |
+
" <td>53.1</td>\n",
|
93 |
+
" <td>58.7</td>\n",
|
94 |
+
" <td>53.3</td>\n",
|
95 |
+
" <td>53</td>\n",
|
96 |
+
" <td>56.800000</td>\n",
|
97 |
+
" <td>61.000000</td>\n",
|
98 |
+
" </tr>\n",
|
99 |
+
" <tr>\n",
|
100 |
+
" <th>4</th>\n",
|
101 |
+
" <td>Qwen-7B-Chat</td>\n",
|
102 |
+
" <td>45.9</td>\n",
|
103 |
+
" <td>46</td>\n",
|
104 |
+
" <td>47.3</td>\n",
|
105 |
+
" <td>50.1</td>\n",
|
106 |
+
" <td>52.1</td>\n",
|
107 |
+
" <td>51</td>\n",
|
108 |
+
" <td>48.300000</td>\n",
|
109 |
+
" <td>49.800000</td>\n",
|
110 |
+
" </tr>\n",
|
111 |
+
" <tr>\n",
|
112 |
+
" <th>5</th>\n",
|
113 |
+
" <td>ChatGLM2-6B</td>\n",
|
114 |
+
" <td>24.8</td>\n",
|
115 |
+
" <td>24.7</td>\n",
|
116 |
+
" <td>36.6</td>\n",
|
117 |
+
" <td>36.5</td>\n",
|
118 |
+
" <td>37.6</td>\n",
|
119 |
+
" <td>37.6</td>\n",
|
120 |
+
" <td>40.500000</td>\n",
|
121 |
+
" <td>40.500000</td>\n",
|
122 |
+
" </tr>\n",
|
123 |
+
" <tr>\n",
|
124 |
+
" <th>6</th>\n",
|
125 |
+
" <td>Chinese-LLaMA-2-13B</td>\n",
|
126 |
+
" <td>29.4</td>\n",
|
127 |
+
" <td>29.4</td>\n",
|
128 |
+
" <td>37.8</td>\n",
|
129 |
+
" <td>37.8</td>\n",
|
130 |
+
" <td>40.4</td>\n",
|
131 |
+
" <td>40.4</td>\n",
|
132 |
+
" <td>28.800000</td>\n",
|
133 |
+
" <td>28.800000</td>\n",
|
134 |
+
" </tr>\n",
|
135 |
+
" <tr>\n",
|
136 |
+
" <th>7</th>\n",
|
137 |
+
" <td>InternLM-7B</td>\n",
|
138 |
+
" <td>38.7</td>\n",
|
139 |
+
" <td>38.7</td>\n",
|
140 |
+
" <td>43.9</td>\n",
|
141 |
+
" <td>43.9</td>\n",
|
142 |
+
" <td>45.2</td>\n",
|
143 |
+
" <td>45.2</td>\n",
|
144 |
+
" <td>51.400000</td>\n",
|
145 |
+
" <td>51.400000</td>\n",
|
146 |
+
" </tr>\n",
|
147 |
+
" <tr>\n",
|
148 |
+
" <th>8</th>\n",
|
149 |
+
" <td>LLaMA-2-7B</td>\n",
|
150 |
+
" <td>39.5</td>\n",
|
151 |
+
" <td>40</td>\n",
|
152 |
+
" <td>45.4</td>\n",
|
153 |
+
" <td>49.5</td>\n",
|
154 |
+
" <td>48.2</td>\n",
|
155 |
+
" <td>46.8</td>\n",
|
156 |
+
" <td>52.000000</td>\n",
|
157 |
+
" <td>55.200000</td>\n",
|
158 |
+
" </tr>\n",
|
159 |
+
" <tr>\n",
|
160 |
+
" <th>9</th>\n",
|
161 |
+
" <td>Baichuan2-13B-Chat</td>\n",
|
162 |
+
" <td>14.1</td>\n",
|
163 |
+
" <td>15.3</td>\n",
|
164 |
+
" <td>24.1</td>\n",
|
165 |
+
" <td>25.8</td>\n",
|
166 |
+
" <td>32.3</td>\n",
|
167 |
+
" <td>33.1</td>\n",
|
168 |
+
" <td>25.600000</td>\n",
|
169 |
+
" <td>27.700000</td>\n",
|
170 |
+
" </tr>\n",
|
171 |
+
" <tr>\n",
|
172 |
+
" <th>10</th>\n",
|
173 |
+
" <td>GPT-4</td>\n",
|
174 |
+
" <td>/</td>\n",
|
175 |
+
" <td>/</td>\n",
|
176 |
+
" <td>/</td>\n",
|
177 |
+
" <td>/</td>\n",
|
178 |
+
" <td>/</td>\n",
|
179 |
+
" <td>/</td>\n",
|
180 |
+
" <td>88.700000</td>\n",
|
181 |
+
" <td>88.700000</td>\n",
|
182 |
+
" </tr>\n",
|
183 |
+
" <tr>\n",
|
184 |
+
" <th>11</th>\n",
|
185 |
+
" <td>AquilaChat2-34B</td>\n",
|
186 |
+
" <td>36.63</td>\n",
|
187 |
+
" <td>36.63</td>\n",
|
188 |
+
" <td>44.83</td>\n",
|
189 |
+
" <td>44.83</td>\n",
|
190 |
+
" <td>46.65</td>\n",
|
191 |
+
" <td>46.65</td>\n",
|
192 |
+
" <td>NaN</td>\n",
|
193 |
+
" <td>NaN</td>\n",
|
194 |
+
" </tr>\n",
|
195 |
+
" <tr>\n",
|
196 |
+
" <th>12</th>\n",
|
197 |
+
" <td>Yi-34B-Chat</td>\n",
|
198 |
+
" <td>57.75</td>\n",
|
199 |
+
" <td>59.14</td>\n",
|
200 |
+
" <td>65.11</td>\n",
|
201 |
+
" <td>68.79</td>\n",
|
202 |
+
" <td>68.16</td>\n",
|
203 |
+
" <td>68.37</td>\n",
|
204 |
+
" <td>78.090000</td>\n",
|
205 |
+
" <td>80.060000</td>\n",
|
206 |
+
" </tr>\n",
|
207 |
+
" <tr>\n",
|
208 |
+
" <th>13</th>\n",
|
209 |
+
" <td>DevOps-Model-14B-Chat</td>\n",
|
210 |
+
" <td>30.69</td>\n",
|
211 |
+
" <td>30.59</td>\n",
|
212 |
+
" <td>55.77</td>\n",
|
213 |
+
" <td>63.63</td>\n",
|
214 |
+
" <td>63.85</td>\n",
|
215 |
+
" <td>61.96</td>\n",
|
216 |
+
" <td>41.150000</td>\n",
|
217 |
+
" <td>44.010000</td>\n",
|
218 |
+
" </tr>\n",
|
219 |
+
" <tr>\n",
|
220 |
+
" <th>14</th>\n",
|
221 |
+
" <td>Qwen-72B-Chat</td>\n",
|
222 |
+
" <td>70.41</td>\n",
|
223 |
+
" <td>70.50</td>\n",
|
224 |
+
" <td>72.38</td>\n",
|
225 |
+
" <td>72.56</td>\n",
|
226 |
+
" <td>70.32</td>\n",
|
227 |
+
" <td>70.32</td>\n",
|
228 |
+
" <td>70.130000</td>\n",
|
229 |
+
" <td>70.220000</td>\n",
|
230 |
+
" </tr>\n",
|
231 |
+
" <tr>\n",
|
232 |
+
" <th>15</th>\n",
|
233 |
+
" <td>Mistral-7B</td>\n",
|
234 |
+
" <td>29.27</td>\n",
|
235 |
+
" <td>29.27</td>\n",
|
236 |
+
" <td>46.30</td>\n",
|
237 |
+
" <td>46.30</td>\n",
|
238 |
+
" <td>47.22</td>\n",
|
239 |
+
" <td>47.22</td>\n",
|
240 |
+
" <td>45.580000</td>\n",
|
241 |
+
" <td>45.580000</td>\n",
|
242 |
+
" </tr>\n",
|
243 |
+
" <tr>\n",
|
244 |
+
" <th>16</th>\n",
|
245 |
+
" <td>Qwen-14B-Chat</td>\n",
|
246 |
+
" <td>43.78</td>\n",
|
247 |
+
" <td>47.81</td>\n",
|
248 |
+
" <td>56.58</td>\n",
|
249 |
+
" <td>59.40</td>\n",
|
250 |
+
" <td>62.09</td>\n",
|
251 |
+
" <td>59.70</td>\n",
|
252 |
+
" <td>49.060000</td>\n",
|
253 |
+
" <td>55.880000</td>\n",
|
254 |
+
" </tr>\n",
|
255 |
+
" <tr>\n",
|
256 |
+
" <th>17</th>\n",
|
257 |
+
" <td>LLaMA-2-70B-Chat</td>\n",
|
258 |
+
" <td>25.29</td>\n",
|
259 |
+
" <td>25.29</td>\n",
|
260 |
+
" <td>57.97</td>\n",
|
261 |
+
" <td>58.06</td>\n",
|
262 |
+
" <td>52.97</td>\n",
|
263 |
+
" <td>52.97</td>\n",
|
264 |
+
" <td>58.550000</td>\n",
|
265 |
+
" <td>58.550000</td>\n",
|
266 |
+
" </tr>\n",
|
267 |
+
" <tr>\n",
|
268 |
+
" <th>18</th>\n",
|
269 |
+
" <td>ERNIE-Bot-4.0</td>\n",
|
270 |
+
" <td>61.15</td>\n",
|
271 |
+
" <td>61.15</td>\n",
|
272 |
+
" <td>70.00</td>\n",
|
273 |
+
" <td>70.00</td>\n",
|
274 |
+
" <td>60.00</td>\n",
|
275 |
+
" <td>60.00</td>\n",
|
276 |
+
" <td>70.000000</td>\n",
|
277 |
+
" <td>70.000000</td>\n",
|
278 |
+
" </tr>\n",
|
279 |
+
" <tr>\n",
|
280 |
+
" <th>19</th>\n",
|
281 |
+
" <td>ChatGLM3-6B</td>\n",
|
282 |
+
" <td>43.38487973</td>\n",
|
283 |
+
" <td>43.38487973</td>\n",
|
284 |
+
" <td>44.58762887</td>\n",
|
285 |
+
" <td>44.58762887</td>\n",
|
286 |
+
" <td>42.09621993</td>\n",
|
287 |
+
" <td>42.09621993</td>\n",
|
288 |
+
" <td>43.470790</td>\n",
|
289 |
+
" <td>43.470790</td>\n",
|
290 |
+
" </tr>\n",
|
291 |
+
" <tr>\n",
|
292 |
+
" <th>20</th>\n",
|
293 |
+
" <td>InternLM2-Chat-20B</td>\n",
|
294 |
+
" <td>56.35738832</td>\n",
|
295 |
+
" <td>56.35738832</td>\n",
|
296 |
+
" <td>26.18025751</td>\n",
|
297 |
+
" <td>26.18025751</td>\n",
|
298 |
+
" <td>60.48109966</td>\n",
|
299 |
+
" <td>60.48109966</td>\n",
|
300 |
+
" <td>45.103093</td>\n",
|
301 |
+
" <td>45.103093</td>\n",
|
302 |
+
" </tr>\n",
|
303 |
+
" <tr>\n",
|
304 |
+
" <th>21</th>\n",
|
305 |
+
" <td>InternLM2-Chat-7B</td>\n",
|
306 |
+
" <td>49.74226804</td>\n",
|
307 |
+
" <td>49.74226804</td>\n",
|
308 |
+
" <td>56.18556701</td>\n",
|
309 |
+
" <td>56.18556701</td>\n",
|
310 |
+
" <td>48.19587629</td>\n",
|
311 |
+
" <td>48.19587629</td>\n",
|
312 |
+
" <td>49.742268</td>\n",
|
313 |
+
" <td>49.742268</td>\n",
|
314 |
+
" </tr>\n",
|
315 |
+
" <tr>\n",
|
316 |
+
" <th>22</th>\n",
|
317 |
+
" <td>gemma_2b</td>\n",
|
318 |
+
" <td>26.46048</td>\n",
|
319 |
+
" <td>26.46048</td>\n",
|
320 |
+
" <td>33.41924</td>\n",
|
321 |
+
" <td>33.41924</td>\n",
|
322 |
+
" <td>26.6323</td>\n",
|
323 |
+
" <td>26.6323</td>\n",
|
324 |
+
" <td>37.542960</td>\n",
|
325 |
+
" <td>37.542960</td>\n",
|
326 |
+
" </tr>\n",
|
327 |
+
" <tr>\n",
|
328 |
+
" <th>23</th>\n",
|
329 |
+
" <td>gemma_7b</td>\n",
|
330 |
+
" <td>25.08591</td>\n",
|
331 |
+
" <td>25.08591</td>\n",
|
332 |
+
" <td>50.85911</td>\n",
|
333 |
+
" <td>50.85911</td>\n",
|
334 |
+
" <td>30.24055</td>\n",
|
335 |
+
" <td>30.24055</td>\n",
|
336 |
+
" <td>51.557470</td>\n",
|
337 |
+
" <td>51.557470</td>\n",
|
338 |
+
" </tr>\n",
|
339 |
+
" <tr>\n",
|
340 |
+
" <th>24</th>\n",
|
341 |
+
" <td>qwen1.5-14b-base</td>\n",
|
342 |
+
" <td>34.87973</td>\n",
|
343 |
+
" <td>34.87973</td>\n",
|
344 |
+
" <td>60.82474</td>\n",
|
345 |
+
" <td>60.82474</td>\n",
|
346 |
+
" <td>65.54983</td>\n",
|
347 |
+
" <td>65.54983</td>\n",
|
348 |
+
" <td>47.079040</td>\n",
|
349 |
+
" <td>47.079040</td>\n",
|
350 |
+
" </tr>\n",
|
351 |
+
" <tr>\n",
|
352 |
+
" <th>25</th>\n",
|
353 |
+
" <td>qwen1.5-14b-chat</td>\n",
|
354 |
+
" <td>54.89691</td>\n",
|
355 |
+
" <td>56.4433</td>\n",
|
356 |
+
" <td>64.08935</td>\n",
|
357 |
+
" <td>67.09622</td>\n",
|
358 |
+
" <td>52.23368</td>\n",
|
359 |
+
" <td>53.52234</td>\n",
|
360 |
+
" <td>59.536080</td>\n",
|
361 |
+
" <td>64.175260</td>\n",
|
362 |
+
" </tr>\n",
|
363 |
+
" </tbody>\n",
|
364 |
+
"</table>\n",
|
365 |
+
"</div>"
|
366 |
+
],
|
367 |
+
"text/plain": [
|
368 |
+
" name zero_native zero_self_con zero_cot \\\n",
|
369 |
+
"0 Baichuan-13B-Chat 18.3 20.4 28.6 \n",
|
370 |
+
"1 Chinese-Alpaca-2-13B 37.7 37.7 49.7 \n",
|
371 |
+
"2 GPT-3.5-turbo 66.6 66.8 69.6 \n",
|
372 |
+
"3 LLaMA-2-13B 41.8 46.5 53.1 \n",
|
373 |
+
"4 Qwen-7B-Chat 45.9 46 47.3 \n",
|
374 |
+
"5 ChatGLM2-6B 24.8 24.7 36.6 \n",
|
375 |
+
"6 Chinese-LLaMA-2-13B 29.4 29.4 37.8 \n",
|
376 |
+
"7 InternLM-7B 38.7 38.7 43.9 \n",
|
377 |
+
"8 LLaMA-2-7B 39.5 40 45.4 \n",
|
378 |
+
"9 Baichuan2-13B-Chat 14.1 15.3 24.1 \n",
|
379 |
+
"10 GPT-4 / / / \n",
|
380 |
+
"11 AquilaChat2-34B 36.63 36.63 44.83 \n",
|
381 |
+
"12 Yi-34B-Chat 57.75 59.14 65.11 \n",
|
382 |
+
"13 DevOps-Model-14B-Chat 30.69 30.59 55.77 \n",
|
383 |
+
"14 Qwen-72B-Chat 70.41 70.50 72.38 \n",
|
384 |
+
"15 Mistral-7B 29.27 29.27 46.30 \n",
|
385 |
+
"16 Qwen-14B-Chat 43.78 47.81 56.58 \n",
|
386 |
+
"17 LLaMA-2-70B-Chat 25.29 25.29 57.97 \n",
|
387 |
+
"18 ERNIE-Bot-4.0 61.15 61.15 70.00 \n",
|
388 |
+
"19 ChatGLM3-6B 43.38487973 43.38487973 44.58762887 \n",
|
389 |
+
"20 InternLM2-Chat-20B 56.35738832 56.35738832 26.18025751 \n",
|
390 |
+
"21 InternLM2-Chat-7B 49.74226804 49.74226804 56.18556701 \n",
|
391 |
+
"22 gemma_2b 26.46048 26.46048 33.41924 \n",
|
392 |
+
"23 gemma_7b 25.08591 25.08591 50.85911 \n",
|
393 |
+
"24 qwen1.5-14b-base 34.87973 34.87973 60.82474 \n",
|
394 |
+
"25 qwen1.5-14b-chat 54.89691 56.4433 64.08935 \n",
|
395 |
+
"\n",
|
396 |
+
" zero_cot_self_con few_native few_self_con few_cot few_cot_self_con \n",
|
397 |
+
"0 37 24.1 26.7 18.200000 17.800000 \n",
|
398 |
+
"1 49.7 48.6 48.6 50.500000 50.500000 \n",
|
399 |
+
"2 72 68.3 68.3 70.900000 72.500000 \n",
|
400 |
+
"3 58.7 53.3 53 56.800000 61.000000 \n",
|
401 |
+
"4 50.1 52.1 51 48.300000 49.800000 \n",
|
402 |
+
"5 36.5 37.6 37.6 40.500000 40.500000 \n",
|
403 |
+
"6 37.8 40.4 40.4 28.800000 28.800000 \n",
|
404 |
+
"7 43.9 45.2 45.2 51.400000 51.400000 \n",
|
405 |
+
"8 49.5 48.2 46.8 52.000000 55.200000 \n",
|
406 |
+
"9 25.8 32.3 33.1 25.600000 27.700000 \n",
|
407 |
+
"10 / / / 88.700000 88.700000 \n",
|
408 |
+
"11 44.83 46.65 46.65 NaN NaN \n",
|
409 |
+
"12 68.79 68.16 68.37 78.090000 80.060000 \n",
|
410 |
+
"13 63.63 63.85 61.96 41.150000 44.010000 \n",
|
411 |
+
"14 72.56 70.32 70.32 70.130000 70.220000 \n",
|
412 |
+
"15 46.30 47.22 47.22 45.580000 45.580000 \n",
|
413 |
+
"16 59.40 62.09 59.70 49.060000 55.880000 \n",
|
414 |
+
"17 58.06 52.97 52.97 58.550000 58.550000 \n",
|
415 |
+
"18 70.00 60.00 60.00 70.000000 70.000000 \n",
|
416 |
+
"19 44.58762887 42.09621993 42.09621993 43.470790 43.470790 \n",
|
417 |
+
"20 26.18025751 60.48109966 60.48109966 45.103093 45.103093 \n",
|
418 |
+
"21 56.18556701 48.19587629 48.19587629 49.742268 49.742268 \n",
|
419 |
+
"22 33.41924 26.6323 26.6323 37.542960 37.542960 \n",
|
420 |
+
"23 50.85911 30.24055 30.24055 51.557470 51.557470 \n",
|
421 |
+
"24 60.82474 65.54983 65.54983 47.079040 47.079040 \n",
|
422 |
+
"25 67.09622 52.23368 53.52234 59.536080 64.175260 "
|
423 |
+
]
|
424 |
+
},
|
425 |
+
"execution_count": 2,
|
426 |
+
"metadata": {},
|
427 |
+
"output_type": "execute_result"
|
428 |
+
}
|
429 |
+
],
|
430 |
+
"source": [
|
431 |
+
"df = pd.read_csv(\"./data/network_en_mc.csv\")\n",
|
432 |
+
"df"
|
433 |
+
]
|
434 |
+
},
|
435 |
+
{
|
436 |
+
"cell_type": "code",
|
437 |
+
"execution_count": 20,
|
438 |
+
"metadata": {},
|
439 |
+
"outputs": [
|
440 |
+
{
|
441 |
+
"data": {
|
442 |
+
"text/html": [
|
443 |
+
"<div>\n",
|
444 |
+
"<style scoped>\n",
|
445 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
446 |
+
" vertical-align: middle;\n",
|
447 |
+
" }\n",
|
448 |
+
"\n",
|
449 |
+
" .dataframe tbody tr th {\n",
|
450 |
+
" vertical-align: top;\n",
|
451 |
+
" }\n",
|
452 |
+
"\n",
|
453 |
+
" .dataframe thead th {\n",
|
454 |
+
" text-align: right;\n",
|
455 |
+
" }\n",
|
456 |
+
"</style>\n",
|
457 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
458 |
+
" <thead>\n",
|
459 |
+
" <tr style=\"text-align: right;\">\n",
|
460 |
+
" <th></th>\n",
|
461 |
+
" <th>Naive</th>\n",
|
462 |
+
" <th>SC</th>\n",
|
463 |
+
" <th>CoT</th>\n",
|
464 |
+
" <th>CoT+SC</th>\n",
|
465 |
+
" </tr>\n",
|
466 |
+
" </thead>\n",
|
467 |
+
" <tbody>\n",
|
468 |
+
" <tr>\n",
|
469 |
+
" <th>0</th>\n",
|
470 |
+
" <td>NaN</td>\n",
|
471 |
+
" <td>NaN</td>\n",
|
472 |
+
" <td>NaN</td>\n",
|
473 |
+
" <td>NaN</td>\n",
|
474 |
+
" </tr>\n",
|
475 |
+
" <tr>\n",
|
476 |
+
" <th>1</th>\n",
|
477 |
+
" <td>57.75</td>\n",
|
478 |
+
" <td>59.14</td>\n",
|
479 |
+
" <td>65.11</td>\n",
|
480 |
+
" <td>68.79</td>\n",
|
481 |
+
" </tr>\n",
|
482 |
+
" <tr>\n",
|
483 |
+
" <th>2</th>\n",
|
484 |
+
" <td>70.41</td>\n",
|
485 |
+
" <td>70.50</td>\n",
|
486 |
+
" <td>72.38</td>\n",
|
487 |
+
" <td>72.56</td>\n",
|
488 |
+
" </tr>\n",
|
489 |
+
" <tr>\n",
|
490 |
+
" <th>3</th>\n",
|
491 |
+
" <td>66.60</td>\n",
|
492 |
+
" <td>66.80</td>\n",
|
493 |
+
" <td>69.60</td>\n",
|
494 |
+
" <td>72.00</td>\n",
|
495 |
+
" </tr>\n",
|
496 |
+
" <tr>\n",
|
497 |
+
" <th>4</th>\n",
|
498 |
+
" <td>61.15</td>\n",
|
499 |
+
" <td>61.15</td>\n",
|
500 |
+
" <td>70.00</td>\n",
|
501 |
+
" <td>70.00</td>\n",
|
502 |
+
" </tr>\n",
|
503 |
+
" <tr>\n",
|
504 |
+
" <th>5</th>\n",
|
505 |
+
" <td>54.90</td>\n",
|
506 |
+
" <td>56.44</td>\n",
|
507 |
+
" <td>64.09</td>\n",
|
508 |
+
" <td>67.10</td>\n",
|
509 |
+
" </tr>\n",
|
510 |
+
" <tr>\n",
|
511 |
+
" <th>6</th>\n",
|
512 |
+
" <td>34.88</td>\n",
|
513 |
+
" <td>34.88</td>\n",
|
514 |
+
" <td>60.82</td>\n",
|
515 |
+
" <td>60.82</td>\n",
|
516 |
+
" </tr>\n",
|
517 |
+
" <tr>\n",
|
518 |
+
" <th>7</th>\n",
|
519 |
+
" <td>30.69</td>\n",
|
520 |
+
" <td>30.59</td>\n",
|
521 |
+
" <td>55.77</td>\n",
|
522 |
+
" <td>63.63</td>\n",
|
523 |
+
" </tr>\n",
|
524 |
+
" <tr>\n",
|
525 |
+
" <th>8</th>\n",
|
526 |
+
" <td>43.78</td>\n",
|
527 |
+
" <td>47.81</td>\n",
|
528 |
+
" <td>56.58</td>\n",
|
529 |
+
" <td>59.40</td>\n",
|
530 |
+
" </tr>\n",
|
531 |
+
" <tr>\n",
|
532 |
+
" <th>9</th>\n",
|
533 |
+
" <td>41.80</td>\n",
|
534 |
+
" <td>46.50</td>\n",
|
535 |
+
" <td>53.10</td>\n",
|
536 |
+
" <td>58.70</td>\n",
|
537 |
+
" </tr>\n",
|
538 |
+
" <tr>\n",
|
539 |
+
" <th>10</th>\n",
|
540 |
+
" <td>56.36</td>\n",
|
541 |
+
" <td>56.36</td>\n",
|
542 |
+
" <td>26.18</td>\n",
|
543 |
+
" <td>26.18</td>\n",
|
544 |
+
" </tr>\n",
|
545 |
+
" <tr>\n",
|
546 |
+
" <th>11</th>\n",
|
547 |
+
" <td>25.29</td>\n",
|
548 |
+
" <td>25.29</td>\n",
|
549 |
+
" <td>57.97</td>\n",
|
550 |
+
" <td>58.06</td>\n",
|
551 |
+
" </tr>\n",
|
552 |
+
" <tr>\n",
|
553 |
+
" <th>12</th>\n",
|
554 |
+
" <td>49.74</td>\n",
|
555 |
+
" <td>49.74</td>\n",
|
556 |
+
" <td>56.19</td>\n",
|
557 |
+
" <td>56.19</td>\n",
|
558 |
+
" </tr>\n",
|
559 |
+
" <tr>\n",
|
560 |
+
" <th>13</th>\n",
|
561 |
+
" <td>39.50</td>\n",
|
562 |
+
" <td>40.00</td>\n",
|
563 |
+
" <td>45.40</td>\n",
|
564 |
+
" <td>49.50</td>\n",
|
565 |
+
" </tr>\n",
|
566 |
+
" <tr>\n",
|
567 |
+
" <th>14</th>\n",
|
568 |
+
" <td>45.90</td>\n",
|
569 |
+
" <td>46.00</td>\n",
|
570 |
+
" <td>47.30</td>\n",
|
571 |
+
" <td>50.10</td>\n",
|
572 |
+
" </tr>\n",
|
573 |
+
" <tr>\n",
|
574 |
+
" <th>15</th>\n",
|
575 |
+
" <td>25.09</td>\n",
|
576 |
+
" <td>25.09</td>\n",
|
577 |
+
" <td>50.86</td>\n",
|
578 |
+
" <td>50.86</td>\n",
|
579 |
+
" </tr>\n",
|
580 |
+
" <tr>\n",
|
581 |
+
" <th>16</th>\n",
|
582 |
+
" <td>38.70</td>\n",
|
583 |
+
" <td>38.70</td>\n",
|
584 |
+
" <td>43.90</td>\n",
|
585 |
+
" <td>43.90</td>\n",
|
586 |
+
" </tr>\n",
|
587 |
+
" <tr>\n",
|
588 |
+
" <th>17</th>\n",
|
589 |
+
" <td>37.70</td>\n",
|
590 |
+
" <td>37.70</td>\n",
|
591 |
+
" <td>49.70</td>\n",
|
592 |
+
" <td>49.70</td>\n",
|
593 |
+
" </tr>\n",
|
594 |
+
" <tr>\n",
|
595 |
+
" <th>18</th>\n",
|
596 |
+
" <td>29.27</td>\n",
|
597 |
+
" <td>29.27</td>\n",
|
598 |
+
" <td>46.30</td>\n",
|
599 |
+
" <td>46.30</td>\n",
|
600 |
+
" </tr>\n",
|
601 |
+
" <tr>\n",
|
602 |
+
" <th>19</th>\n",
|
603 |
+
" <td>36.63</td>\n",
|
604 |
+
" <td>36.63</td>\n",
|
605 |
+
" <td>44.83</td>\n",
|
606 |
+
" <td>44.83</td>\n",
|
607 |
+
" </tr>\n",
|
608 |
+
" <tr>\n",
|
609 |
+
" <th>20</th>\n",
|
610 |
+
" <td>43.38</td>\n",
|
611 |
+
" <td>43.38</td>\n",
|
612 |
+
" <td>44.59</td>\n",
|
613 |
+
" <td>44.59</td>\n",
|
614 |
+
" </tr>\n",
|
615 |
+
" <tr>\n",
|
616 |
+
" <th>21</th>\n",
|
617 |
+
" <td>24.80</td>\n",
|
618 |
+
" <td>24.70</td>\n",
|
619 |
+
" <td>36.60</td>\n",
|
620 |
+
" <td>36.50</td>\n",
|
621 |
+
" </tr>\n",
|
622 |
+
" <tr>\n",
|
623 |
+
" <th>22</th>\n",
|
624 |
+
" <td>29.40</td>\n",
|
625 |
+
" <td>29.40</td>\n",
|
626 |
+
" <td>37.80</td>\n",
|
627 |
+
" <td>37.80</td>\n",
|
628 |
+
" </tr>\n",
|
629 |
+
" <tr>\n",
|
630 |
+
" <th>23</th>\n",
|
631 |
+
" <td>26.46</td>\n",
|
632 |
+
" <td>26.46</td>\n",
|
633 |
+
" <td>33.42</td>\n",
|
634 |
+
" <td>33.42</td>\n",
|
635 |
+
" </tr>\n",
|
636 |
+
" <tr>\n",
|
637 |
+
" <th>24</th>\n",
|
638 |
+
" <td>18.30</td>\n",
|
639 |
+
" <td>20.40</td>\n",
|
640 |
+
" <td>28.60</td>\n",
|
641 |
+
" <td>37.00</td>\n",
|
642 |
+
" </tr>\n",
|
643 |
+
" <tr>\n",
|
644 |
+
" <th>25</th>\n",
|
645 |
+
" <td>14.10</td>\n",
|
646 |
+
" <td>15.30</td>\n",
|
647 |
+
" <td>24.10</td>\n",
|
648 |
+
" <td>25.80</td>\n",
|
649 |
+
" </tr>\n",
|
650 |
+
" </tbody>\n",
|
651 |
+
"</table>\n",
|
652 |
+
"</div>"
|
653 |
+
],
|
654 |
+
"text/plain": [
|
655 |
+
" Naive SC CoT CoT+SC\n",
|
656 |
+
"0 NaN NaN NaN NaN\n",
|
657 |
+
"1 57.75 59.14 65.11 68.79\n",
|
658 |
+
"2 70.41 70.50 72.38 72.56\n",
|
659 |
+
"3 66.60 66.80 69.60 72.00\n",
|
660 |
+
"4 61.15 61.15 70.00 70.00\n",
|
661 |
+
"5 54.90 56.44 64.09 67.10\n",
|
662 |
+
"6 34.88 34.88 60.82 60.82\n",
|
663 |
+
"7 30.69 30.59 55.77 63.63\n",
|
664 |
+
"8 43.78 47.81 56.58 59.40\n",
|
665 |
+
"9 41.80 46.50 53.10 58.70\n",
|
666 |
+
"10 56.36 56.36 26.18 26.18\n",
|
667 |
+
"11 25.29 25.29 57.97 58.06\n",
|
668 |
+
"12 49.74 49.74 56.19 56.19\n",
|
669 |
+
"13 39.50 40.00 45.40 49.50\n",
|
670 |
+
"14 45.90 46.00 47.30 50.10\n",
|
671 |
+
"15 25.09 25.09 50.86 50.86\n",
|
672 |
+
"16 38.70 38.70 43.90 43.90\n",
|
673 |
+
"17 37.70 37.70 49.70 49.70\n",
|
674 |
+
"18 29.27 29.27 46.30 46.30\n",
|
675 |
+
"19 36.63 36.63 44.83 44.83\n",
|
676 |
+
"20 43.38 43.38 44.59 44.59\n",
|
677 |
+
"21 24.80 24.70 36.60 36.50\n",
|
678 |
+
"22 29.40 29.40 37.80 37.80\n",
|
679 |
+
"23 26.46 26.46 33.42 33.42\n",
|
680 |
+
"24 18.30 20.40 28.60 37.00\n",
|
681 |
+
"25 14.10 15.30 24.10 25.80"
|
682 |
+
]
|
683 |
+
},
|
684 |
+
"execution_count": 20,
|
685 |
+
"metadata": {},
|
686 |
+
"output_type": "execute_result"
|
687 |
+
}
|
688 |
+
],
|
689 |
+
"source": [
|
690 |
+
"def process_mc_df(df):\n",
|
691 |
+
" # 将name列重命名为Model\n",
|
692 |
+
" df = df.rename(columns={\"name\": \"Model\"})\n",
|
693 |
+
" # 将zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot, few_cot_self_con列重新组织成MultiIndex,一层为Zeroshot, Fewshot,一层为Naive, Self-Consistency, CoT, CoT+Self-Consistency\n",
|
694 |
+
" df = df.set_index(\"Model\")\n",
|
695 |
+
" # df = df.stack().unstack()\n",
|
696 |
+
" df.columns = pd.MultiIndex.from_tuples([(\"Zeroshot\", \"Naive\"), (\"Zeroshot\", \"SC\"), (\"Zeroshot\", \"CoT\"), (\"Zeroshot\", \"CoT+SC\"), (\"Fewshot\", \"Naive\"), (\"Fewshot\", \"SC\"), (\"Fewshot\", \"CoT\"), (\"Fewshot\", \"CoT+SC\")])\n",
|
697 |
+
" # 将除了Model列之外的列的value转换为数值型,失败的为NaN\n",
|
698 |
+
" df = df.apply(pd.to_numeric, errors=\"coerce\")\n",
|
699 |
+
" # 显示小数点后两位\n",
|
700 |
+
" df = df.round(2)\n",
|
701 |
+
" # 给每一行添加���列BestScore\n",
|
702 |
+
" df[\"BestScore\"] = df.max(axis=1)\n",
|
703 |
+
" # 根据BestScore给df排序\n",
|
704 |
+
" df = df.sort_values(by=\"BestScore\", ascending=False)\n",
|
705 |
+
" # \n",
|
706 |
+
" df = df.reset_index()\n",
|
707 |
+
" return df\n",
|
708 |
+
"\n",
|
709 |
+
"processed = process_mc_df(df)\n",
|
710 |
+
"processed.columns\n",
|
711 |
+
"processed['Zeroshot']"
|
712 |
+
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "opencompass",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
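Note: process_mc_df returns a frame with two-level columns (Zeroshot/Fewshot over Naive, SC, CoT, CoT+SC) plus Model and BestScore, which a flat table widget cannot render directly. Below is a minimal sketch, not part of the commit, of flattening those headers for display; it assumes the process_mc_df definition from the cell above is in scope and the CSV layout shown in the outputs.

import pandas as pd

# Sketch only: flatten the MultiIndex columns produced by process_mc_df
# into plain string headers so the table can be handed to gr.components.Dataframe.
df = pd.read_csv("./data/network_en_mc.csv")
processed = process_mc_df(df)  # assumes the notebook definition above

flat = processed.copy()
flat.columns = [
    " ".join(level for level in col if level) if isinstance(col, tuple) else col
    for col in flat.columns
]
print(flat.columns.tolist())  # e.g. ['Model', 'Zeroshot Naive', ..., 'BestScore']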
leaderboards.py
ADDED
@@ -0,0 +1,22 @@
+eng_leaderboards = [
+    ('zte', ['mc']),
+    ('lenovo', ['mc']),
+    ('oracle', ['mc']),
+    ('network', ['mc', 'qa']),
+]
+
+chi_leaderboards = [
+    ('huaweicloud', ['mc']),
+    ('gtja', ['mc']),
+    ('zjyd', ['mc', 'qa']),
+    ('network', ['mc', 'qa']),
+    ('pufa', ['mc']),
+    ('zabbix', ['mc']),
+    ('dfcdata', ['mc']),
+    ('zte', ['mc']),
+    ('oracle', ['mc']),
+    ('tencent', ['qa']),
+    ('bosc', ['mc']),
+    ('rzy', ['mc']),
+    ('lenovo', ['mc']),
+]
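Each entry pairs a dataset abbreviation with its available question formats ('mc' and 'qa', presumably the multiple-choice and question-answering splits); together with the language code they name the CSV files under data/. A small sketch, assuming these lists are importable as shown, that enumerates the files the pairs imply, mirroring the ./data/{dataset}_{lang}_{cat}.csv pattern used by create_lang_tabs in app.py:

# Sketch: list the leaderboard CSVs implied by the (dataset, categories) pairs.
from leaderboards import eng_leaderboards, chi_leaderboards

for lang, boards in (("en", eng_leaderboards), ("zh", chi_leaderboards)):
    for dataset, cats in boards:
        for cat in cats:
            print(f"./data/{dataset}_{lang}_{cat}.csv")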
opseval_datasets.py
ADDED
@@ -0,0 +1,63 @@
+datasets_abbr = [
+    'huaweicloud',
+    'gtja',
+    'zjyd',
+    'network',
+    'pufa',
+    'zabbix',
+    'dfcdata',
+    'zte',
+    'oracle',
+    'tencent',
+    'bosc',
+    'rzy',
+    'lenovo'
+]
+
+datasets_zh = [
+    '5G通信运维(华为核心网)',
+    '证券信息系统运维(国泰君安)',
+    '中国移动浙江公司',
+    '有线网络运维(清华Netman)',
+    '金融IT运维(浦发银行)',
+    '运维监控能力测评(Zabbix中国宏时数据)',
+    '数据库运维(基石数据)',
+    '5G通信网络运维(中兴通信)',
+    'Oracle数据库运维(中亦科技)',
+    'DevOps能力评测(腾讯)',
+    '金融信创系统运维(上海银行)',
+    '日志分析能力评测(日志易)',
+    '混合云建设与运维(联想集团)'
+]
+
+datasets_en = [
+    "5G Telecommunications",
+    "Securities Information System",
+    "China Mobile Zhejiang",
+    "Wired Network Operations",
+    "Financial IT",
+    "Operations Monitoring Capability",
+    "Database",
+    "5G Telecommunications Network",
+    "Oracle Database",
+    "DevOps Capability",
+    "Financial New Generation System",
+    "Log Analysis",
+    "Hybrid Cloud Construction and Operations"
+]
+
+dataset_abbr_zh_dict = {
+    da: dz for da, dz in zip(datasets_abbr, datasets_zh)
+}
+
+dataset_abbr_en_dict = {
+    da: de for da, de in zip(datasets_abbr, datasets_en)
+}
+
+dataset_zh_en_dict = {
+    dz: de for dz, de in zip(datasets_zh, datasets_en)
+}
+
+dataset_en_zh_dict = {
+    de: dz for dz, de in zip(datasets_zh, datasets_en)
+}
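The three parallel lists are zipped into lookup dictionaries so a dataset abbreviation can be mapped to its Chinese or English display name, and the two display names to each other. A short usage sketch, assuming the module is importable as added above:

# Sketch: resolve the display names for one leaderboard tab.
from opseval_datasets import dataset_abbr_en_dict, dataset_abbr_zh_dict

abbr = "network"
print(dataset_abbr_en_dict[abbr])  # Wired Network Operations
print(dataset_abbr_zh_dict[abbr])  # 有线网络运维(清华Netman)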