Junetheriver commited on
Commit
a6d507f
1 Parent(s): 45c0614

added leaderboards

Browse files
app.py CHANGED
@@ -9,23 +9,124 @@ import matplotlib.pyplot as plt
9
  import plotly.graph_objects as go
10
  from apscheduler.schedulers.background import BackgroundScheduler
11
  from texts import INTRODUCTION_TEXT, TITLE
 
 
12
 
13
- df_lang = {
14
- 'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
15
- 'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
 
18
- def create_lang_leader_board(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  headers = df.columns
20
- types = ["str"] + ["number"] * (len(headers) - 1)
21
 
22
  return gr.components.Dataframe(
23
  value=df.values.tolist(),
24
- headers=[col_name for col_name in headers],
25
- datatype=types,
26
  # max_rows=10,
27
  )
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def launch_gradio():
31
  demo = gr.Blocks()
@@ -33,12 +134,14 @@ def launch_gradio():
33
  with demo:
34
  gr.HTML(TITLE)
35
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
36
- for key, df in df_lang.items():
37
  with gr.Tab(key):
38
- create_lang_leader_board(df)
39
 
40
  demo.launch()
41
 
 
 
42
  scheduler = BackgroundScheduler()
43
  scheduler.add_job(launch_gradio, 'interval', hours=1)
44
  scheduler.start()
 
9
  import plotly.graph_objects as go
10
  from apscheduler.schedulers.background import BackgroundScheduler
11
  from texts import INTRODUCTION_TEXT, TITLE
12
+ from leaderboards import eng_leaderboards, chi_leaderboards
13
+ from opseval_datasets import *
14
 
15
+
16
+ # df_lang = {
17
+ # 'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
18
+ # 'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
19
+ # }
20
+
21
+
22
+
23
+ def create_lang_tabs(lang, lang_cates):
24
+ df_dict = {}
25
+ for dataset, cates in lang_cates:
26
+ dataset_dt = {}
27
+ for cat in cates:
28
+ leaderboard_df = pd.read_csv(f'./data/{dataset}_{lang}_{cat}.csv')
29
+ dataset_dt[cat] = leaderboard_df
30
+ df_dict[dataset] = dataset_dt
31
+ return df_dict
32
+
33
+
34
+ dict_lang = {
35
+ 'English': create_lang_tabs('en', eng_leaderboards),
36
+ 'Chinese': create_lang_tabs('zh', chi_leaderboards)
37
  }
38
 
39
+ def process_mc_df(df, shot=None):
40
+ # 将name列重命名为Model
41
+ df = df.rename(columns={"name": "Model"})
42
+ # 将zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot, few_cot_self_con列重新组织成MultiIndex,一层为Zeroshot, Fewshot,一层为Naive, Self-Consistency, CoT, CoT+Self-Consistency
43
+ df = df.set_index("Model")
44
+ # df = df.stack().unstack()
45
+ df.columns = pd.MultiIndex.from_tuples([("Zeroshot", "Naive"), ("Zeroshot", "SC"), ("Zeroshot", "CoT"), ("Zeroshot", "CoT+SC"), ("Fewshot", "Naive"), ("Fewshot", "SC"), ("Fewshot", "CoT"), ("Fewshot", "CoT+SC")])
46
+ # 保留shot的列,比如如果shot=Zeroshot那么只有Zeroshot的列会被保留
47
+ if shot:
48
+ df = df[shot]
49
+ # 将除了Model列之外的列的value转换为数值型,失败的为NaN
50
+ df = df.apply(pd.to_numeric, errors="coerce")
51
+ # 保留小数点后两位
52
+ df = df.round(2)
53
+ # 给每一行添加一列BestScore
54
+ df["BestScore"] = df.max(axis=1)
55
+ # 根据BestScore给df排序
56
+ df = df.sort_values(by="BestScore", ascending=False)
57
+ # reset_index
58
+ df = df.reset_index()
59
+ return df
60
+
61
+ def dataframe_to_gradio(df, is_mc=True, shot=None):
62
+
63
+ if is_mc:
64
+ df = process_mc_df(df, shot)
65
  headers = df.columns
66
+ # types = ["str"] + ["number"] * (len(headers) - 1)
67
 
68
  return gr.components.Dataframe(
69
  value=df.values.tolist(),
70
+ headers=[label for label in df.columns],
71
+ # datatype=types,
72
  # max_rows=10,
73
  )
74
 
75
+ def plot_radar_chart(df, attributes):
76
+ fig = go.Figure()
77
+
78
+ for index, row in df.iterrows():
79
+ model = row['Model']
80
+ values = row[attributes].tolist()
81
+ fig.add_trace(go.Scatterpolar(
82
+ r=values,
83
+ theta=attributes,
84
+ fill='toself',
85
+ name=model
86
+ ))
87
+
88
+ fig.update_layout(
89
+ title="OpsEval",
90
+ polar=dict(
91
+ radialaxis=dict(
92
+ visible=True,
93
+ range=[0, 0.9]
94
+ )),
95
+ showlegend=True
96
+ )
97
+
98
+ return fig
99
+
100
+
101
+ def create_lang_leader_board(lang_dict):
102
+
103
+ best_scores = {}
104
+ best_plot_datasets = []
105
+ for dataset, value in lang_dict.items():
106
+ for cat, df in value.items():
107
+ if cat == 'mc':
108
+ processed = process_mc_df(df)
109
+ bestscores = processed['BestScore']
110
+ best_scores[dataset] = bestscores
111
+ best_plot_datasets.append(dataset)
112
+ best_df = pd.DataFrame(best_scores)
113
+ # print(best_scores)
114
+ # print(best_df)
115
+ # plot = plot_radar_chart(pd.DataFrame(best_scores), best_plot_datasets)
116
+ # gr.Plot(plot)
117
+
118
+ for dataset, value in lang_dict.items():
119
+ with gr.Tab(dataset_abbr_en_dict[dataset]):
120
+ for cat, df in value.items():
121
+ if cat == 'mc':
122
+ for shot in ['Zeroshot', 'Fewshot']:
123
+ with gr.Tab(f'Multiple Choice Question ({shot})'):
124
+ dataframe_to_gradio(df, is_mc=True, shot=shot)
125
+ else:
126
+ with gr.Tab('Question Answering'):
127
+ dataframe_to_gradio(df, is_mc=False)
128
+
129
+
130
 
131
  def launch_gradio():
132
  demo = gr.Blocks()
 
134
  with demo:
135
  gr.HTML(TITLE)
136
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
137
+ for key, dict in dict_lang.items():
138
  with gr.Tab(key):
139
+ create_lang_leader_board(dict)
140
 
141
  demo.launch()
142
 
143
+ pd.set_option('display.float_format', '{:.02f}'.format)
144
+
145
  scheduler = BackgroundScheduler()
146
  scheduler.add_job(launch_gradio, 'interval', hours=1)
147
  scheduler.start()
data/bosc_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,52.50,52.50,62.50,62.50,57.50,57.50,57.50,57.50
3
+ Yi-34B-Chat,50.00,50.00,52.50,55.00,55.00,55.00,60.00,67.50
4
+ DevOps-Model-14B-Chat,50.00,50.00,55.00,62.50,35.00,27.50,37.50,52.50
5
+ LLaMA-2-7B,45.00,45.00,45.00,45.00,32.50,32.50,45.00,45.00
6
+ Qwen-72B-Chat,45.00,45.00,60.00,60.00,50.00,50.00,47.50,47.50
7
+ GPT-3.5-turbo,40.00,40.00,50.00,55.00,50.00,47.50,55.00,55.00
8
+ ERNIE-Bot-4.0,52.50,52.50,57.50,57.50,57.50,57.50,60.00,60.00
9
+ Mistral-7B,20.00,20.00,50.00,50.00,0.00,0.00,37.50,37.50
10
+ LLaMA-2-13B,50.00,50.00,42.50,42.50,42.50,42.50,50.00,50.00
11
+ Baichuan2-13B-Chat,37.50,37.50,42.50,45.00,37.50,40.00,47.50,52.50
12
+ Qwen-14B-Chat,50.00,47.50,55.00,57.50,47.50,45.00,50.00,47.50
13
+ LLaMA-2-70B-Chat,25.00,25.00,45.00,45.00,0.00,0.00,57.50,57.50
14
+ ChatGLM3-6B,47.5,47.5,45,45,35,35,50,50
15
+ InternLM2-Chat-20B,47.5,47.5,,,47.5,47.5,,
16
+ InternLM2-Chat-7B,55,55,62.5,62.5,60,60,57.5,57.5
17
+ gemma_2b,32.5,32.5,40,40 ,37.5,37.5,40,40
18
+ gemma_7b,40,40,50,50 ,32.5,32.5,62.5,62.5
19
+ qwen1.5-14b-base,47.5,47.5,45,45 ,47.5,47.5,50,50
20
+ qwen1.5-14b-chat,52.5,55,60,60,45,47.5,60,72.5
21
+
data/dfcdata_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,82.39,82.39,90.14,90.14,85.21,85.21,86.62,86.62
3
+ Yi-34B-Chat,86.62,86.62,76.06,85.21,78.87,80.28,85.92,86.62
4
+ DevOps-Model-14B-Chat,80.99,78.87,51.41,63.38,33.80,34.51,54.23,56.34
5
+ LLaMA-2-7B,45.07,45.07,61.97,61.97,30.28,30.28,45.77,45.77
6
+ Qwen-72B-Chat,83.80,83.80,83.80,83.80,86.62,86.62,83.80,83.80
7
+ GPT-3.5-turbo,71.13,73.24,80.28,78.87,77.46,76.06,82.39,81.69
8
+ ERNIE-Bot-4.0,83.00,83.00,85.00,85.00,81.00,81.00,82.00,82.00
9
+ Mistral-7B,16.90,16.90,64.08,64.08,2.82,2.82,64.79,64.79
10
+ LLaMA-2-13B,61.97,61.97,61.27,61.27,45.77,45.77,70.42,70.42
11
+ Baichuan2-13B-Chat,62.68,64.08,68.31,66.20,64.79,66.20,68.31,73.24
12
+ Qwen-14B-Chat,76.06,74.65,69.01,71.83,73.94,73.94,73.24,76.76
13
+ LLaMA-2-70B-Chat,41.55,40.85,72.54,72.54,14.79,14.79,67.61,67.61
14
+ ChatGLM3-6B,51.4084507,51.4084507,57.04225352,57.04225352,55.63380282,55.63380282,61.97183099,61.97183099
15
+ InternLM2-Chat-20B,78.16901408,78.16901408,,,74.64788732,74.64788732,74.64788732,74.64788732
16
+ InternLM2-Chat-7B,74.64788732,74.64788732,57.04225352,57.04225352,76.05633803,76.05633803,73.94366197,73.94366197
17
+ gemma_2b,27.46479,27.46479,41.5493,41.5493 ,28.16901,28.16901,38.02817,38.02817
18
+ gemma_7b,50.70423,50.70423,66.90141,66.90141 ,35.91549,35.91549,59.15493,59.15493
19
+ qwen1.5-14b-base,81.69014,81.69014,57.04225,57.04225 ,73.23944,73.23944,76.05634,76.05634
20
+ qwen1.5-14b-chat,83.80282,80.98592,78.87324,80.98592,75.35211,76.05634,80.28169,83.09859
21
+
data/gtja_zh_mc.csv ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,70.33,70.33,71.43,71.43,68.13,68.13,67.03,67.03
3
+ Yi-34B-Chat,69.23,70.33,49.45,47.25,71.43,74.73,71.43,73.63
4
+ DevOps-Model-14B-Chat,61.54,59.34,52.75,63.74,41.76,38.46,45.05,49.45
5
+ LLaMA-2-7B,42.86,42.86,45.05,45.05,28.57,28.57,45.05,45.05
6
+ Qwen-72B-Chat,70.33,70.33,74.73,74.73,71.43,71.43,67.03,67.03
7
+ GPT-3.5-turbo,47.25,52.75,57.14,58.24,49.45,52.75,59.34,62.64
8
+ ERNIE-Bot-4.0,65.93,65.93,68.13,68.13,68.13,68.13,64.84,64.84
9
+ Mistral-7B,14.29,14.29,38.46,38.46,5.49,5.49,47.25,47.25
10
+ LLaMA-2-13B,47.25,47.25,42.86,42.86,30.77,30.77,47.25,47.25
11
+ Baichuan2-13B-Chat,38.46,38.46,49.45,51.65,41.76,41.76,53.85,60.44
12
+ Qwen-14B-Chat,54.95,54.95,59.34,61.54,47.25,47.25,53.85,54.95
13
+ LLaMA-2-70B-Chat,19.78,19.78,49.45,49.45,6.59,6.59,48.35,48.35
14
+ ChatGLM3-6B,43.95604396,43.95604396,47.25274725,47.25274725,43.95604396,43.95604396,53.84615385,53.84615385
15
+ InternLM2-Chat-20B,65.93406593,65.93406593,,,56.04395604,56.04395604,,
16
+ InternLM2-Chat-7B,54.94505495,54.94505495,51.64835165,51.64835165,56.04395604,56.04395604,59.34065934,59.34065934
17
+ gemma_2b,32.96703,32.96703,29.67033,29.67033,30.76923,30.76923,43.95604,43.95604
18
+ gemma_7b,34.06593,34.06593,50.54945,50.54945,29.67033,29.67033,56.04396,56.04396
19
+ qwen1.5-14b-base,68.13187,68.13187,42.85714,42.85714,53.84615,53.84615,63.73626,63.73626
20
+ qwen1.5-14b-chat,59.34066,57.14286,60.43956,62.63736,56.04396,54.94505,67.03297,68.13187
data/huaweicloud_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,43.33,43.33,46.67,46.67,20.00,20.00,20.00,20.00
3
+ Yi-34B-Chat,50.00,46.67,30.00,43.33,36.67,40.00,36.67,30.00
4
+ DevOps-Model-14B-Chat,40.00,40.00,20.00,23.33,16.67,16.67,33.33,13.33
5
+ LLaMA-2-7B,16.67,16.67,33.33,33.33,10.00,10.00,26.67,26.67
6
+ Qwen-72B-Chat,43.33,43.33,33.33,36.67,36.67,36.67,33.33,33.33
7
+ GPT-3.5-turbo,20.00,20.00,16.67,23.33,13.33,13.33,20.00,26.67
8
+ ERNIE-Bot-4.0,36.67,36.67,23.33,23.33,16.67,16.67,20.00,20.00
9
+ Mistral-7B,0.00,0.00,16.67,16.67,0.00,0.00,23.33,23.33
10
+ LLaMA-2-13B,26.67,26.67,13.33,13.33,10.00,10.00,20.00,20.00
11
+ Baichuan2-13B-Chat,16.67,20.00,26.67,30.00,6.67,10.00,23.33,23.33
12
+ Qwen-14B-Chat,40.00,30.00,26.67,33.33,13.33,13.33,20.00,26.67
13
+ LLaMA-2-70B-Chat,23.33,23.33,16.67,16.67,3.33,3.33,20.00,20.00
14
+ ChatGLM3-6B,6.666666667,6.666666667,13.33333333,13.33333333,13.33333333,13.33333333,16.66666667,16.66666667
15
+ InternLM2-Chat-20B,16.66666667,16.66666667,,,13.33333333,13.33333333,20,20
16
+ InternLM2-Chat-7B,30,30,40,40,43.33333333,43.33333333,23.33333333,23.33333333
17
+ gemma_2b,26.66667,26.66667,20,20 ,26.66667,26.66667,10,10
18
+ gemma_7b,13.33333,13.33333,30,30 ,3.333333,3.333333,23.33333,23.33333
19
+ qwen1.5-14b-base,20,20,30,30 ,20,20,33.33333,33.33333
20
+ qwen1.5-14b-chat,26.66667,26.66667,20,30,23.33333,26.66667,13.33333,13.33333
21
+
data/lenovo_en_mc.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ ChatGLM3-6B,60,60,60,60,55,55,60,60
3
+ InternLM2-Chat-20B,62.5,62.5,,,75,75,,
4
+ InternLM2-Chat-7B,65,65,67.5,67.5,75,75,57.5,57.5
5
+ gemma_2b,22.5,22.5,47.5,47.5 ,30,30,37.5,37.5
6
+ gemma_7b,32.5,32.5,65,65 ,35,35,65,65
7
+ qwen1.5-14b-base,67.5,67.5,70,70 ,72.5,72.5,50,50
8
+ qwen1.5-14b-chat,67.5,67.5,70,70,72.5,65,77.5,77.5
9
+
data/lenovo_zh_mc.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,77.50,77.50,82.50,82.50,77.50,77.50,82.50,82.50
3
+ Yi-34B-Chat,62.50,57.50,52.50,52.50,75.00,75.00,87.50,82.50
4
+ DevOps-Model-14B-Chat,67.50,70.00,62.50,70.00,60.00,67.50,65.00,57.50
5
+ LLaMA-2-7B,60.00,60.00,55.00,55.00,32.50,32.50,45.00,45.00
6
+ Qwen-72B-Chat,75.00,75.00,75.00,75.00,72.50,72.50,75.00,75.00
7
+ GPT-3.5-turbo,57.50,57.50,62.50,62.50,60.00,62.50,65.00,70.00
8
+ ERNIE-Bot-4.0,75.00,75.00,82.50,82.50,75.00,75.00,77.50,77.50
9
+ Mistral-7B,35.00,35.00,60.00,60.00,47.50,47.50,62.50,62.50
10
+ LLaMA-2-13B,60.00,60.00,55.00,55.00,45.00,45.00,62.50,62.50
11
+ ChatGLM3-6B,55.00,55.00,60.00,60.00,60.00,60.00,60.00,60.00
12
+ Baichuan2-13B-Chat,62.50,60.00,70.00,67.50,65.00,60.00,72.50,67.50
13
+ Qwen-14B-Chat,70.00,65.00,65.00,67.50,70.00,67.50,70.00,67.50
14
+ LLaMA-2-70B-Chat,20.00,20.00,57.50,57.50,22.50,22.50,75.00,75.00
15
+
data/network_en_mc.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,18.3,20.4,28.6,37,24.1,26.7,18.2,17.8
3
+ Chinese-Alpaca-2-13B,37.7,37.7,49.7,49.7,48.6,48.6,50.5,50.5
4
+ GPT-3.5-turbo,66.6,66.8,69.6,72,68.3,68.3,70.9,72.5
5
+ LLaMA-2-13B,41.8,46.5,53.1,58.7,53.3,53,56.8,61
6
+ Qwen-7B-Chat,45.9,46,47.3,50.1,52.1,51,48.3,49.8
7
+ ChatGLM2-6B,24.8,24.7,36.6,36.5,37.6,37.6,40.5,40.5
8
+ Chinese-LLaMA-2-13B,29.4,29.4,37.8,37.8,40.4,40.4,28.8,28.8
9
+ InternLM-7B,38.7,38.7,43.9,43.9,45.2,45.2,51.4,51.4
10
+ LLaMA-2-7B,39.5,40,45.4,49.5,48.2,46.8,52,55.2
11
+ Baichuan2-13B-Chat,14.1,15.3,24.1,25.8,32.3,33.1,25.6,27.7
12
+ GPT-4,/,/,/,/,/,/,88.7,88.7
13
+ AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL
14
+ Yi-34B-Chat,57.75,59.14,65.11,68.79,68.16,68.37,78.09,80.06
15
+ DevOps-Model-14B-Chat,30.69,30.59,55.77,63.63,63.85,61.96,41.15,44.01
16
+ Qwen-72B-Chat,70.41,70.50,72.38,72.56,70.32,70.32,70.13,70.22
17
+ Mistral-7B,29.27,29.27,46.30,46.30,47.22,47.22,45.58,45.58
18
+ Qwen-14B-Chat,43.78,47.81,56.58,59.40,62.09,59.70,49.06,55.88
19
+ LLaMA-2-70B-Chat,25.29,25.29,57.97,58.06,52.97,52.97,58.55,58.55
20
+ ERNIE-Bot-4.0,61.15,61.15,70.00,70.00,60.00,60.00,70.00,70.00
21
+ ChatGLM3-6B,43.38487973,43.38487973,44.58762887,44.58762887,42.09621993,42.09621993,43.47079038,43.47079038
22
+ InternLM2-Chat-20B,56.35738832,56.35738832,26.18025751,26.18025751,60.48109966,60.48109966,45.10309278,45.10309278
23
+ InternLM2-Chat-7B,49.74226804,49.74226804,56.18556701,56.18556701,48.19587629,48.19587629,49.74226804,49.74226804
24
+ gemma_2b,26.46048,26.46048,33.41924,33.41924 ,26.6323,26.6323,37.54296,37.54296
25
+ gemma_7b,25.08591,25.08591,50.85911,50.85911 ,30.24055,30.24055,51.55747,51.55747
26
+ qwen1.5-14b-base,34.87973,34.87973,60.82474,60.82474 ,65.54983,65.54983,47.07904,47.07904
27
+ qwen1.5-14b-chat,54.89691,56.4433,64.08935,67.09622,52.23368,53.52234,59.53608,64.17526
28
+
data/network_en_qa.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,rouge1,rouge2,rouge_l,rouge_lsum,score,bp,sys_len,gpt4_score
2
+ GPT-3.5-turbo,13.38,5.65,12.13,12.26,6.78,1,2966,8.47
3
+ LLaMA-2-70B,8.69,2.51,7.62,7.74,4.2,1,4970,7.28
4
+ LLaMA-2-13B,5.75,1.68,5.03,4.98,3.43,1,8239,7.16
5
+ Chinese-Alpaca-2-13B,3.48,0.96,3.19,3.25,1.85,1,14716,6.66
6
+ Baichuan-13B-Chat,5.58,1.85,4.66,4.76,0.35,1,9577,5.85
7
+ Qwen-7B-Chat,13.03,4.76,11.61,11.82,4.33,1,3091,5.63
8
+ ChatGLM2-6B,10.43,3.24,9.82,9.71,5.07,0.91,2492,4.88
9
+ InternLM-7B,14.34,5.39,13.3,13.27,0.54,1,3112,4.52
10
+ Chinese-LLaMA-2-13B,9.18,2.9,9.22,9.19,0.24,1,32006,2.39
11
+
data/network_zh_mc.csv ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,15.2,16,43.9,49.7,34.3,36.1,51.3,55.6
3
+ Chinese-Alpaca-2-13B,33.1,33.1,44.2,44.2,44,44,42.7,42.7
4
+ GPT-3.5-turbo,58.4,58.6,64.8,67.6,59.2,59.7,65.2,67.4
5
+ LLaMA-2-13B,29.7,31.6,51.6,57,39.6,38.9,48,50.6
6
+ Qwen-7B-Chat,29.6,29.9,50.6,53.5,50.4,46.9,46.9,47.7
7
+ ChatGLM2-6B,33.8,33.7,42.1,42.2,36,36,39.5,39.5
8
+ Chinese-LLaMA-2-13B,22.5,22.5,38.8,38.8,41.8,41.8,32.2,32.2
9
+ InternLM-7B,41.7,41.7,38.4,38.4,42.6,42.6,41.3,41.3
10
+ LLaMA-2-7B,29.8,30.2,50.1,55.6,38.6,40.8,45.6,50.4
11
+ Baichuan2-13B-Chat,35.6,35.9,30.5,30.5,34.6,35.6,30.2,32.0
12
+ GPT-4,/,/,/,/,/,/,86,86
13
+ AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL
14
+ Yi-34B-Chat,61.61,62.56,68.11,69.75,65.73,65.37,69.88,71.21
15
+ DevOps-Model-14B-Chat,47.59,46.57,52.52,56.01,62.07,60.08,50.59,55.79
16
+ Qwen-72B-Chat,65.77,65.86,68.13,68.30,69.40,69.40,69.99,70.08
17
+ Mistral-7B,1.90,1.90,45.61,45.61,15.00,15.00,35.97,35.97
18
+ Qwen-14B-Chat,48.35,48.81,55.35,57.40,58.53,56.12,52.12,54.99
19
+ LLaMA-2-70B-Chat,38.55,38.55,57.49,57.49,49.09,49.09,48.57,48.57
20
+ ERNIE-Bot-4.0,67.54,67.54,71.96,71.96,72.00,72.00,78.00,78.00
21
+ Hunyuan-13B,60.00,60.00,70.00,70.00,,,,
22
+ ChatGLM3-6B,41.39414802,41.39414802,49.22547332,49.22547332,38.81239243,38.81239243,42.85714286,42.85714286
23
+ InternLM2-Chat-20B,57.48709122,57.48709122,57.14285714,57.14285714,59.1222031,59.1222031,50.77452668,50.77452668
24
+ InternLM2-Chat-7B,54.30292599,54.30292599,59.81067126,59.81067126,58.51979346,58.51979346,51.63511188,51.63511188
25
+ GLM3-turbo,59.63855422,59.63855422,,,,,,
26
+ GLM4,67.383821,67.383821,,,,,,
27
+ gemma_2b,29.69019,29.69019,39.15663,39.15663 ,29.77625,29.77625,38.64028,38.64028
28
+ gemma_7b,31.58348,31.58348,47.59036,47.59036 ,34.68158,34.68158,48.88124,48.88124
29
+ qwen1.5-14b-base,45.18072,45.18072,59.1222,59.1222 ,61.10155,61.10155,52.4957,52.4957
30
+ qwen1.5-14b-chat,54.04475,53.87263,62.56454,63.85542,58.77797,58.0895,63.42513,65.57659
31
+
data/network_zh_qa.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ name,rouge1,rouge2,rouge_l,rouge_lsum,score,bp,sys_len,gpt4_score
2
+ GPT-3.5-turbo,17.28,6.39,16.84,16.87,1.89,0.74,368,6.98
3
+ ChatGLM2-6B,6.92,1.97,6.83,6.75,0.11,1,1867,4.46
4
+ InternLM-7B,2.76,1.03,2.76,2.76,0.01,1,6053,2.22
5
+ Baichuan-13B-Chat,9.09,3.67,9.04,9.2,0.53,1,1125,5.14
6
+ LLaMA-2-13B,4.29,1.29,4.2,4.22,0.23,1,1581,5.03
7
+ Chinese-LLaMA-2-13B,4.96,4.11,4.7,4.73,0.01,1,11371,1.77
8
+ Chinese-Alpaca-2-13B,10.03,2.19,9.86,9.97,0.02,1,2605,4.71
9
+ Qwen-7B-Chat,10,2.45,9.94,10.05,0.23,0.42,257,5.07
10
+
data/oracle_en_mc.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,12.47,11.67,16.50,19.52,24.55,22.54,26.36,28.77
3
+ Chinese-Alpaca-2-13B,23.14,23.14,28.97,28.97,16.30,16.30,14.29,14.29
4
+ GPT-3.5-turbo,38.63,38.83,40.04,42.05,36.62,37.63,42.66,43.86
5
+ LLaMA-2-13B,16.10,20.32,23.94,29.58,20.12,22.33,24.35,33.80
6
+ Qwen-7B-Chat,18.91,19.11,22.13,23.94,26.76,25.55,34.81,33.40
7
+ ChatGLM2-6B,20.72,20.52,19.92,19.72,20.12,20.12,22.94,22.74
8
+ Chinese-LLaMA-2-13B,13.88,13.88,20.52,20.52,16.90,16.90,23.34,23.34
9
+ InternLM-7B,26.36,26.36,25.55,25.55,25.55,25.55,27.97,27.97
10
+ LLaMA-2-7B,22.13,23.74,23.74,26.56,19.32,20.52,28.77,33.60
11
+ Baichuan2-13B-Chat,17.1,19.1,18.7,22.9,25.9,26.5,20.9,24.5
12
+ GPT-4,/,/,59.02,64.56,/,/,58.35,62.58
13
+ AquilaChat2-34B,36.63,36.63,44.83,44.83,46.65,46.65,NULL,NULL
14
+ Yi-34B-Chat,47.08,48.69,47.08,46.28,58.15,58.35,56.94,58.95
15
+ DevOps-Model-14B-Chat,25.15,26.96,35.41,38.83,33.20,34.81,27.36,27.36
16
+ Qwen-72B-Chat,47.28,47.48,48.09,48.09,49.70,49.70,43.46,43.66
17
+ ERNIE-Bot-4.0,43.80,43.80,47.14,47.14,46.00,46.00,54.00,54.00
18
+ Mistral-7B,17.10,17.10,26.76,26.76,31.19,31.19,27.97,27.97
19
+ Qwen-14B-Chat,24.95,28.37,33.00,36.62,27.97,28.37,27.97,24.14
20
+ LLaMA-2-70B-Chat,19.72,19.72,27.97,27.97,26.56,26.56,32.60,32.60
21
+ ChatGLM3-6B,20.92555332,20.92555332,25.15090543,25.15090543,24.74849095,24.74849095,29.1750503,29.1750503
22
+ InternLM2-Chat-20B,,,59.21052632,59.21052632,,,,
23
+ InternLM2-Chat-7B,27.16297787,27.16297787,28.16901408,28.16901408,29.97987928,29.97987928,30.18108652,30.18108652
24
+ gemma_2b,16.90141,16.90141,19.5171,19.5171 ,16.09658,16.09658,24.74849,24.74849
25
+ gemma_7b,14.28571,14.28571,30.98592,30.98592 ,2.60223,2.60223,43.85965,43.85965
26
+ qwen1.5-14b-base,29.17505,29.17505,33.60161,33.60161 ,36.82093,36.82093,27.7666,27.7666
27
+ qwen1.5-14b-chat,32.79678,35.41247,39.43662,43.05835,32.39437,33.60161,36.82093,38.833
28
+
data/oracle_zh_mc.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,12.88,12.07,25.96,27.57,18.91,19.52,27.97,30.58
3
+ Chinese-Alpaca-2-13B,22.94,22.94,25.75,25.75,25.15,25.15,22.33,22.33
4
+ GPT-3.5-turbo,36.42,35.81,39.24,43.26,39.84,39.44,27.16,27.77
5
+ LLaMA-2-13B,23.94,24.35,29.58,31.99,24.55,26.76,21.13,20.72
6
+ Qwen-7B-Chat,18.51,17.71,27.36,28.37,29.78,29.58,33.60,31.79
7
+ ChatGLM2-6B,23.34,23.34,24.35,24.14,22.94,22.94,26.16,26.16
8
+ Chinese-LLaMA-2-13B,14.69,14.69,19.92,19.92,19.72,19.72,20.93,20.93
9
+ InternLM-7B,25.96,25.96,25.96,25.96,29.18,29.18,28.37,28.37
10
+ LLaMA-2-7B,20.72,20.72,27.16,27.97,21.53,18.51,18.31,17.91
11
+ Baichuan2-13B-Chat,25.7,25.5,20.1,21.3,27.7,26.7,22.7,24.7
12
+ GPT-4,/,/,59.38,65.17,/,/,44.06,48.09
13
+ AquilaChat2-34B,34.66,34.66,47.74,47.74,44.48,44.48,NULL,NULL
14
+ Mistral-7B,1.90,1.90,45.61,45.61,15.00,15.00,35.97,35.97
15
+ Yi-34B-Chat,49.90,49.30,52.72,53.72,56.34,56.34,51.31,54.33
16
+ DevOps-Model-14B-Chat,24.75,22.74,28.37,27.77,36.62,37.02,27.57,26.36
17
+ Qwen-72B-Chat,48.29,48.49,49.50,49.70,49.70,49.70,45.27,44.87
18
+ ERNIE-Bot-4.0,48.56,48.56,50.64,50.64,48.00,48.00,54.00,54.00
19
+ Mistral-7B,0.20,0.20,26.76,26.76,10.26,10.26,32.19,32.19
20
+ Qwen-14B-Chat,27.57,27.57,32.39,36.02,40.04,35.41,30.38,33.40
21
+ LLaMA-2-70B-Chat,15.29,15.29,34.81,34.81,26.76,26.76,33.80,33.80
22
+ ChatGLM3-6B,21.32796781,21.32796781,28.97384306,28.97384306,21.73038229,21.73038229,29.57746479,29.57746479
23
+ InternLM2-Chat-7B,28.57142857,28.57142857,31.79074447,31.79074447,30.78470825,30.78470825,31.18712274,31.18712274
24
+ gemma_2b,18.51107,18.51107,24.9497,24.9497 ,21.52918,21.52918,27.7666,27.7666
25
+ gemma_7b,19.3159,19.3159,53.94737,53.94737 ,18.51107,18.51107,5.204461,5.204461
26
+ qwen1.5-14b-base,20.92555,20.92555,35.61368,35.61368 ,41.44869,41.44869,30.78471,30.78471
27
+ qwen1.5-14b-chat,24.14487,23.34004,40.64386,41.04628,38.22938,38.02817,39.43662,40.04024
28
+
data/pufa_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,84.00,84.00,90.67,90.67,88.00,88.00,86.67,86.67
3
+ Yi-34B-Chat,90.67,92.00,78.67,89.33,84.00,84.00,88.00,88.00
4
+ DevOps-Model-14B-Chat,82.67,81.33,53.33,70.67,29.33,29.33,62.67,61.33
5
+ LLaMA-2-7B,48.00,48.00,52.00,52.00,25.33,25.33,40.00,40.00
6
+ Qwen-72B-Chat,88.00,88.00,82.67,82.67,90.67,90.67,85.33,85.33
7
+ GPT-3.5-turbo,76.00,78.67,84.00,82.67,77.33,77.33,84.00,81.33
8
+ ERNIE-Bot-4.0,82.67,82.67,86.67,86.67,86.67,86.67,86.67,86.67
9
+ Mistral-7B,22.67,22.67,54.67,54.67,4.00,4.00,58.67,58.67
10
+ LLaMA-2-13B,61.33,61.33,53.33,53.33,44.00,44.00,68.00,68.00
11
+ Baichuan2-13B-Chat,62.67,61.33,62.67,62.67,65.33,66.67,66.67,66.67
12
+ Qwen-14B-Chat,73.33,73.33,72.00,80.00,73.33,73.33,69.33,72.00
13
+ LLaMA-2-70B-Chat,49.33,49.33,66.67,66.67,6.67,6.67,65.33,65.33
14
+ ChatGLM3-6B,56,56,58.66666667,58.66666667,60,60,61.33333333,61.33333333
15
+ InternLM2-Chat-20B,80,80,,,76,76,80,80
16
+ InternLM2-Chat-7B,72,72,53.33333333,53.33333333,78.66666667,78.66666667,72,72
17
+ gemma_2b,36,36,30.66667,30.66667,36,36,41.33333,41.33333
18
+ gemma_7b,46.66667,46.66667,56,56,34.66667,34.66667,56,56
19
+ qwen1.5-14b-base,92,92,42.66667,42.66667,78.66667,78.66667,72,72
20
+ qwen1.5-14b-chat,78.66667,80,86.66667,85.33333,86.66667,89.33333,85.33333,85.33333
21
+
data/rzy_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,65.28,65.28,68.19,68.19,65.56,65.56,68.05,68.05
3
+ Yi-34B-Chat,60.17,60.03,57.68,57.54,64.45,64.59,67.77,67.36
4
+ DevOps-Model-14B-Chat,65.28,64.18,55.19,61.83,53.67,56.85,54.50,59.20
5
+ LLaMA-2-7B,46.20,46.20,53.39,53.39,34.85,34.85,44.95,44.95
6
+ Qwen-72B-Chat,65.98,65.98,70.12,70.12,66.67,66.67,65.28,65.28
7
+ GPT-3.5-turbo,65.28,66.25,68.05,68.74,65.28,65.42,66.39,67.50
8
+ ERNIE-Bot-4.0,73.00,73.00,77.00,77.00,76.00,76.00,79.00,79.00
9
+ Mistral-7B,29.88,29.88,59.75,59.75,18.53,18.53,60.30,60.30
10
+ LLaMA-2-13B,57.12,57.12,53.39,53.39,51.18,51.18,59.06,59.06
11
+ Baichuan2-13B-Chat,59.06,59.34,64.45,64.32,60.17,60.17,62.79,67.50
12
+ Qwen-14B-Chat,65.28,63.49,62.93,65.98,61.96,61.55,61.55,64.45
13
+ LLaMA-2-70B-Chat,48.82,48.82,59.75,59.75,5.26,5.26,62.52,62.52
14
+ ChatGLM3-6B,55.32503458,55.32503458,59.33609959,59.33609959,54.21853389,54.21853389,62.10235131,62.10235131
15
+ InternLM2-Chat-20B,,,,,63.90041494,63.90041494,64.03872752,64.03872752
16
+ InternLM2-Chat-7B,65.00691563,65.00691563,54.21853389,54.21853389,61.2724758,61.2724758,63.62378976,63.62378976
17
+ gemma_2b,33.60996,33.60996,37.75934,37.75934 ,36.37621,36.37621,45.22822,45.22822
18
+ gemma_7b,42.04703,42.04703,56.70816,56.70816 ,39.41909,39.41909,54.77178,54.77178
19
+ qwen1.5-14b-base,65.42185,65.42185,50.89903,50.89903 ,51.17566,51.17566,62.6556,62.6556
20
+ qwen1.5-14b-chat,63.34716,63.7621,65.42185,65.9751,62.93223,64.03873,64.59198,64.31535
21
+
data/tencent_zh_qa.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,gpt4_score
2
+ Baichuan2-13B-Chat,8.727272727272727
3
+ DevOps-Model-14B-Chat,8.25974026
4
+ LLaMA-2-13B,7.636363636363637
5
+ LLaMA-2-70B-Chat,7.740259740259741
6
+ Mistral-7B,7.8441558441558445
7
+ Qwen-14B-Chat,8.642857142857142
8
+ Qwen-72B-Chat,8.811688311688311
9
+ GPT4,9.019480519480519
10
+ Yi-34B-Chat,8.844155844155845
11
+ ChatGLM3-6B,8.577922077922079
12
+ LLaMA-2-7B,5.318181818181818
13
+ GPT-3.5-turbo,8.850649351
14
+
data/zabbix_zh_mc.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,few_native,few_self_con,few_cot,few_cot_self_con,zero_native,zero_self_con,zero_cot,zero_cot_self_con
2
+ GPT4,60.00,60.00,59.00,59.00,51.00,51.00,53.00,53.00
3
+ Yi-34B-Chat,42.00,42.00,42.00,42.00,40.00,40.00,40.00,40.00
4
+ DevOps-Model-14B-Chat,46.00,44.00,44.00,46.00,27.00,28.00,36.00,33.00
5
+ LLaMA-2-7B,22.00,22.00,28.00,28.00,18.00,18.00,35.00,35.00
6
+ Qwen-72B-Chat,45.00,45.00,61.00,61.00,46.00,46.00,44.00,44.00
7
+ GPT-3.5-turbo,40.00,40.00,48.00,48.00,36.00,36.00,42.00,42.00
8
+ ERNIE-Bot-4.0,47.00,47.00,51.00,51.00,44.00,44.00,48.00,48.00
9
+ Mistral-7B,11.00,11.00,44.00,44.00,6.00,6.00,42.00,42.00
10
+ LLaMA-2-13B,40.00,40.00,43.00,43.00,28.00,28.00,45.00,45.00
11
+ Baichuan2-13B-Chat,29.00,27.00,40.00,43.00,31.00,29.00,47.00,47.00
12
+ Qwen-14B-Chat,44.00,40.00,47.00,43.00,36.00,36.00,39.00,41.00
13
+ LLaMA-2-70B-Chat,29.00,29.00,46.00,46.00,1.00,1.00,47.00,47.00
14
+ ChatGLM3-6B,29,29,34,34,29,29,36,36
15
+ InternLM2-Chat-20B,44,44,,,41,41,,
16
+ InternLM2-Chat-7B,45,45,35,35,43,43,39,39
17
+ gemma_2b,24,24,30,30,25,25,32,32
18
+ gemma_7b,28,28,40,40 ,22,22,44,44
19
+ qwen1.5-14b-base,48,48,36,36,38,38,39,39
20
+ qwen1.5-14b-chat,42,39,48,49,34,34,45,43
21
+
data/zjyd_zh_mc.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,11.04,11.13,26.92,28.61,14.35,13.22,31.69,33.97
3
+ Chinese-Alpaca-2-13B,22.69,22.69,24.59,24.59,40.52,40.52,40.73,40.73
4
+ GPT-3.5-turbo,37.06,36.83,37.56,39.25,39.42,39.77,41.96,42.15
5
+ LLaMA-2-13B,25.43,27.16,29.17,29.99,36.56,36.15,37.70,39.02
6
+ Qwen-7B-Chat,36.28,36.50,33.18,33.51,41.58,40.59,31.48,31.46
7
+ ChatGLM2-6B,23.09,23.12,24.22,24.08,30.46,30.46,35.97,35.90
8
+ Chinese-LLaMA-2-13B,17.98,17.98,17.83,17.83,31.66,31.66,36.24,36.24
9
+ InternLM-7B,27.81,27.81,19.95,19.95,24.18,24.18,35.35,35.35
10
+ LLaMA-2-7B,24.09,23.47,28.69,29.26,29.94,30.03,31.35,31.93
11
+ GPT-4,/,/,57.35,62.11,/,/,61.20,65.68
12
+ Yi-34B-Chat,64.91,64.58,62.77,65.51,70.85,70.92,48.77,47.97
13
+ DevOps-Model-14B-Chat,41.04,42.70,48.71,53.57,56.85,57.25,51.30,54.29
14
+ Qwen-72B-Chat,64.79,64.79,65.79,65.72,70.19,70.19,68.31,68.38
15
+ ERNIE-Bot-4.0,45.99,45.99,48.98,48.98,46.00,46.00,54.00,54.00
16
+ Mistral-7B,1.27,1.27,42.05,42.05,30.72,30.72,46.44,46.44
17
+ Qwen-14B-Chat,41.71,41.44,45.58,47.98,53.52,49.92,54.72,58.85
18
+ LLaMA-2-70B-Chat,24.38,24.38,43.63,43.63,44.65,44.65,48.84,48.84
19
+ ChatGLM3-6B,32.6,32.6,35.4,35.4,28.3,28.3,40.9,40.9
20
+ InternLM2-Chat-20B,44.6,44.6,47,47,62.2,62.2,38.3,38.3
21
+ InternLM2-Chat-7B,38.8,38.8,44.6,44.6,46,46,35.8,35.8
22
+ GLM3-turbo,43,43,,,,,,
23
+ GLM4,50,50,,,,,,
24
+ gemma_2b,25.6,25.6,28.3,28.3 ,19.1,19.1,35.5,35.5
25
+ gemma_7b,27.3,27.3,35.4,35.4 ,17.3,17.3,44.5,44.5
26
+ qwen1.5-14b-base,49.1,49.1,49.9,49.9 ,62.5,62.5,41.3,41.3
27
+ qwen1.5-14b-chat,38.6,38.9,48.8,50.5,54.6,55.2,52.1,52.7
28
+
data/zjyd_zh_qa.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ GPT-3.5-turbo,79.48718,81.19658,88.39286,89.28571 ,79.48717949,81.1965812,86.60714286,88.39285714
3
+ Gemma-2B,26.49573,26.49573,62.5,62.5 ,58.97435897,58.97435897,75,75
4
+ Gemma-7B,77.77778,77.77778,75.89286,75.89286 ,76.06837607,76.06837607,86.60714286,86.60714286
5
+ Qwen1.5-0.5B-Base,65.17857,65.17857,75,75 ,56.25,56.25,57.14285714,57.14285714
6
+ Qwen1.5-0.5B-Chat,0,0,54.46429,53.57143 ,20.53571429,19.64285714,16.96428571,17.85714286
7
+ Qwen1.5-1.8B-Base,71.42857,71.42857,71.42857,71.42857 ,70.53571429,70.53571429,80.35714286,80.35714286
8
+ Qwen1.5-1.8B-Chat,73.21429,69.64286,67.85714,77.67857 ,66.07142857,66.07142857,68.75,75
9
+ Qwen1.5-14B-Base,76.92308,76.92308,88.39286,88.39286 ,78.63247863,78.63247863,83.03571429,83.03571429
10
+ Qwen1.5-14B-Chat,18.75,23.21429,91.07143,92.85714 ,79.46428571,78.57142857,80.35714286,83.92857143
11
+
data/zte_en_mc.csv ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Baichuan-13B-Chat,11.60,14.31,14.68,18.46,14.56,15.68,16.21,16.82
3
+ Chinese-Alpaca-2-13B,20.86,20.86,23.08,23.08,29.75,29.75,32.83,32.83
4
+ GPT-3.5-turbo,35.04,34.82,38.46,43.50,39.29,39.19,41.01,42.58
5
+ LLaMA-2-13B,15.62,18.32,29.88,34.45,23.16,29.14,37.59,44.30
6
+ Qwen-7B-Chat,33.37,33.74,32.97,34.10,32.98,32.70,36.60,36.65
7
+ ChatGLM2-6B,15.94,16.06,19.83,19.91,26.27,26.22,28.25,28.37
8
+ Chinese-LLaMA-2-13B,10.02,10.02,19.51,19.51,34.51,34.51,33.34,33.34
9
+ InternLM-7B,20.48,20.48,23.85,23.85,23.69,23.69,26.06,26.06
10
+ LLaMA-2-7B,19.42,21.62,25.46,27.11,21.45,24.85,33.60,34.83
11
+ GPT-4,/,/,56.90,65.49,/,/,59.39,63.54
12
+ Yi-34B-Chat,38.24,37.04,48.24,52.10,61.33,61.19,53.53,53.39
13
+ DevOps-Model-14B-Chat,31.04,30.51,42.84,47.37,52.25,49.38,45.90,47.23
14
+ Qwen-72B-Chat,53.19,53.19,55.25,55.52,58.13,58.13,58.72,58.99
15
+ ERNIE-Bot-4.0,43.66,43.66,51.99,51.99,44.00,44.00,50.00,50.00
16
+ Mistral-7B,26.91,26.91,30.65,30.65,40.52,40.52,46.84,46.84
17
+ Qwen-14B-Chat,33.71,36.25,41.24,42.51,51.19,50.39,57.18,59.18
18
+ LLaMA-2-70B-Chat,23.64,23.64,39.31,39.31,38.98,39.12,47.90,47.90
19
+ ChatGLM3-6B,30.4,30.4,30.7,30.7,26.9,26.9,37.2,37.2
20
+ InternLM2-Chat-20B,39.1,39.1,37.7,37.7,47.7,47.7,33.5,33.5
21
+ InternLM2-Chat-7B,36.8,36.8,31.7,31.7,46.3,46.3,36.9,36.9
22
+ gemma_2b,20.1,20.1,24.2,24.2 ,31.2,31.2,35.5,35.5
23
+ gemma_7b,23.1,23.1,34.4,34.4 ,21.4,21.4,33.1,33.1
24
+ qwen1.5-14b-base,34,34,42.8,42.8 ,57.9,57.9,40.2,40.2
25
+ qwen1.5-14b-chat,34.5,35.6,41.7,41.1,33.2,34.7,46.2,47.4
26
+
data/zte_zh_mc.csv ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name,zero_native,zero_self_con,zero_cot,zero_cot_self_con,few_native,few_self_con,few_cot,few_cot_self_con
2
+ Yi-34B-Chat,64.91,64.58,62.77,65.51,70.85,70.92,48.77,47.97
3
+ Qwen-72B-Chat,64.79,64.79,65.79,65.72,70.19,70.19,68.31,68.38
4
+ GPT-4,/,/,57.35,62.11,/,/,61.2,65.68
5
+ qwen1.5-14b-base,49.1,49.1,49.9,49.9,62.5,62.5,41.3,41.3
6
+ InternLM2-Chat-20B,44.6,44.6,47,47,62.2,62.2,38.3,38.3
7
+ Qwen-14B-Chat,41.71,41.44,45.58,47.98,53.52,49.92,54.72,58.85
8
+ DevOps-Model-14B-Chat,41.04,42.7,48.71,53.57,56.85,57.25,51.3,54.29
9
+ qwen1.5-14b-chat,38.6,38.9,48.8,50.5,54.6,55.2,52.1,52.7
10
+ ERNIE-Bot-4.0,45.99,45.99,48.98,48.98,46,46,54,54
11
+ GLM4,50,50,,,,,,
12
+ LLaMA-2-70B-Chat,24.38,24.38,43.63,43.63,44.65,44.65,48.84,48.84
13
+ Mistral-7B,1.27,1.27,42.05,42.05,30.72,30.72,46.44,46.44
14
+ InternLM2-Chat-7B,38.8,38.8,44.6,44.6,46,46,35.8,35.8
15
+ gemma_7b,27.3,27.3,35.4,35.4,17.3,17.3,44.5,44.5
16
+ GLM3-turbo,43,43,,,,,,
17
+ GPT-3.5-turbo,37.06,36.83,37.56,39.25,39.42,39.77,41.96,42.15
18
+ Qwen-7B-Chat,36.28,36.5,33.18,33.51,41.58,40.59,31.48,31.46
19
+ ChatGLM3-6B,32.6,32.6,35.4,35.4,28.3,28.3,40.9,40.9
20
+ Chinese-Alpaca-2-13B,22.69,22.69,24.59,24.59,40.52,40.52,40.73,40.73
21
+ LLaMA-2-13B,25.43,27.16,29.17,29.99,36.56,36.15,37.7,39.02
22
+ Chinese-LLaMA-2-13B,17.98,17.98,17.83,17.83,31.66,31.66,36.24,36.24
23
+ ChatGLM2-6B,23.09,23.12,24.22,24.08,30.46,30.46,35.97,35.9
24
+ gemma_2b,25.6,25.6,28.3,28.3,19.1,19.1,35.5,35.5
25
+ InternLM-7B,27.81,27.81,19.95,19.95,24.18,24.18,35.35,35.35
26
+ Baichuan-13B-Chat,11.04,11.13,26.92,28.61,14.35,13.22,31.69,33.97
27
+ LLaMA-2-7B,24.09,23.47,28.69,29.26,29.94,30.03,31.35,31.93
28
+
df_process.ipynb ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import os, json"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/html": [
21
+ "<div>\n",
22
+ "<style scoped>\n",
23
+ " .dataframe tbody tr th:only-of-type {\n",
24
+ " vertical-align: middle;\n",
25
+ " }\n",
26
+ "\n",
27
+ " .dataframe tbody tr th {\n",
28
+ " vertical-align: top;\n",
29
+ " }\n",
30
+ "\n",
31
+ " .dataframe thead th {\n",
32
+ " text-align: right;\n",
33
+ " }\n",
34
+ "</style>\n",
35
+ "<table border=\"1\" class=\"dataframe\">\n",
36
+ " <thead>\n",
37
+ " <tr style=\"text-align: right;\">\n",
38
+ " <th></th>\n",
39
+ " <th>name</th>\n",
40
+ " <th>zero_native</th>\n",
41
+ " <th>zero_self_con</th>\n",
42
+ " <th>zero_cot</th>\n",
43
+ " <th>zero_cot_self_con</th>\n",
44
+ " <th>few_native</th>\n",
45
+ " <th>few_self_con</th>\n",
46
+ " <th>few_cot</th>\n",
47
+ " <th>few_cot_self_con</th>\n",
48
+ " </tr>\n",
49
+ " </thead>\n",
50
+ " <tbody>\n",
51
+ " <tr>\n",
52
+ " <th>0</th>\n",
53
+ " <td>Baichuan-13B-Chat</td>\n",
54
+ " <td>18.3</td>\n",
55
+ " <td>20.4</td>\n",
56
+ " <td>28.6</td>\n",
57
+ " <td>37</td>\n",
58
+ " <td>24.1</td>\n",
59
+ " <td>26.7</td>\n",
60
+ " <td>18.200000</td>\n",
61
+ " <td>17.800000</td>\n",
62
+ " </tr>\n",
63
+ " <tr>\n",
64
+ " <th>1</th>\n",
65
+ " <td>Chinese-Alpaca-2-13B</td>\n",
66
+ " <td>37.7</td>\n",
67
+ " <td>37.7</td>\n",
68
+ " <td>49.7</td>\n",
69
+ " <td>49.7</td>\n",
70
+ " <td>48.6</td>\n",
71
+ " <td>48.6</td>\n",
72
+ " <td>50.500000</td>\n",
73
+ " <td>50.500000</td>\n",
74
+ " </tr>\n",
75
+ " <tr>\n",
76
+ " <th>2</th>\n",
77
+ " <td>GPT-3.5-turbo</td>\n",
78
+ " <td>66.6</td>\n",
79
+ " <td>66.8</td>\n",
80
+ " <td>69.6</td>\n",
81
+ " <td>72</td>\n",
82
+ " <td>68.3</td>\n",
83
+ " <td>68.3</td>\n",
84
+ " <td>70.900000</td>\n",
85
+ " <td>72.500000</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>3</th>\n",
89
+ " <td>LLaMA-2-13B</td>\n",
90
+ " <td>41.8</td>\n",
91
+ " <td>46.5</td>\n",
92
+ " <td>53.1</td>\n",
93
+ " <td>58.7</td>\n",
94
+ " <td>53.3</td>\n",
95
+ " <td>53</td>\n",
96
+ " <td>56.800000</td>\n",
97
+ " <td>61.000000</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>4</th>\n",
101
+ " <td>Qwen-7B-Chat</td>\n",
102
+ " <td>45.9</td>\n",
103
+ " <td>46</td>\n",
104
+ " <td>47.3</td>\n",
105
+ " <td>50.1</td>\n",
106
+ " <td>52.1</td>\n",
107
+ " <td>51</td>\n",
108
+ " <td>48.300000</td>\n",
109
+ " <td>49.800000</td>\n",
110
+ " </tr>\n",
111
+ " <tr>\n",
112
+ " <th>5</th>\n",
113
+ " <td>ChatGLM2-6B</td>\n",
114
+ " <td>24.8</td>\n",
115
+ " <td>24.7</td>\n",
116
+ " <td>36.6</td>\n",
117
+ " <td>36.5</td>\n",
118
+ " <td>37.6</td>\n",
119
+ " <td>37.6</td>\n",
120
+ " <td>40.500000</td>\n",
121
+ " <td>40.500000</td>\n",
122
+ " </tr>\n",
123
+ " <tr>\n",
124
+ " <th>6</th>\n",
125
+ " <td>Chinese-LLaMA-2-13B</td>\n",
126
+ " <td>29.4</td>\n",
127
+ " <td>29.4</td>\n",
128
+ " <td>37.8</td>\n",
129
+ " <td>37.8</td>\n",
130
+ " <td>40.4</td>\n",
131
+ " <td>40.4</td>\n",
132
+ " <td>28.800000</td>\n",
133
+ " <td>28.800000</td>\n",
134
+ " </tr>\n",
135
+ " <tr>\n",
136
+ " <th>7</th>\n",
137
+ " <td>InternLM-7B</td>\n",
138
+ " <td>38.7</td>\n",
139
+ " <td>38.7</td>\n",
140
+ " <td>43.9</td>\n",
141
+ " <td>43.9</td>\n",
142
+ " <td>45.2</td>\n",
143
+ " <td>45.2</td>\n",
144
+ " <td>51.400000</td>\n",
145
+ " <td>51.400000</td>\n",
146
+ " </tr>\n",
147
+ " <tr>\n",
148
+ " <th>8</th>\n",
149
+ " <td>LLaMA-2-7B</td>\n",
150
+ " <td>39.5</td>\n",
151
+ " <td>40</td>\n",
152
+ " <td>45.4</td>\n",
153
+ " <td>49.5</td>\n",
154
+ " <td>48.2</td>\n",
155
+ " <td>46.8</td>\n",
156
+ " <td>52.000000</td>\n",
157
+ " <td>55.200000</td>\n",
158
+ " </tr>\n",
159
+ " <tr>\n",
160
+ " <th>9</th>\n",
161
+ " <td>Baichuan2-13B-Chat</td>\n",
162
+ " <td>14.1</td>\n",
163
+ " <td>15.3</td>\n",
164
+ " <td>24.1</td>\n",
165
+ " <td>25.8</td>\n",
166
+ " <td>32.3</td>\n",
167
+ " <td>33.1</td>\n",
168
+ " <td>25.600000</td>\n",
169
+ " <td>27.700000</td>\n",
170
+ " </tr>\n",
171
+ " <tr>\n",
172
+ " <th>10</th>\n",
173
+ " <td>GPT-4</td>\n",
174
+ " <td>/</td>\n",
175
+ " <td>/</td>\n",
176
+ " <td>/</td>\n",
177
+ " <td>/</td>\n",
178
+ " <td>/</td>\n",
179
+ " <td>/</td>\n",
180
+ " <td>88.700000</td>\n",
181
+ " <td>88.700000</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>11</th>\n",
185
+ " <td>AquilaChat2-34B</td>\n",
186
+ " <td>36.63</td>\n",
187
+ " <td>36.63</td>\n",
188
+ " <td>44.83</td>\n",
189
+ " <td>44.83</td>\n",
190
+ " <td>46.65</td>\n",
191
+ " <td>46.65</td>\n",
192
+ " <td>NaN</td>\n",
193
+ " <td>NaN</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>12</th>\n",
197
+ " <td>Yi-34B-Chat</td>\n",
198
+ " <td>57.75</td>\n",
199
+ " <td>59.14</td>\n",
200
+ " <td>65.11</td>\n",
201
+ " <td>68.79</td>\n",
202
+ " <td>68.16</td>\n",
203
+ " <td>68.37</td>\n",
204
+ " <td>78.090000</td>\n",
205
+ " <td>80.060000</td>\n",
206
+ " </tr>\n",
207
+ " <tr>\n",
208
+ " <th>13</th>\n",
209
+ " <td>DevOps-Model-14B-Chat</td>\n",
210
+ " <td>30.69</td>\n",
211
+ " <td>30.59</td>\n",
212
+ " <td>55.77</td>\n",
213
+ " <td>63.63</td>\n",
214
+ " <td>63.85</td>\n",
215
+ " <td>61.96</td>\n",
216
+ " <td>41.150000</td>\n",
217
+ " <td>44.010000</td>\n",
218
+ " </tr>\n",
219
+ " <tr>\n",
220
+ " <th>14</th>\n",
221
+ " <td>Qwen-72B-Chat</td>\n",
222
+ " <td>70.41</td>\n",
223
+ " <td>70.50</td>\n",
224
+ " <td>72.38</td>\n",
225
+ " <td>72.56</td>\n",
226
+ " <td>70.32</td>\n",
227
+ " <td>70.32</td>\n",
228
+ " <td>70.130000</td>\n",
229
+ " <td>70.220000</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>15</th>\n",
233
+ " <td>Mistral-7B</td>\n",
234
+ " <td>29.27</td>\n",
235
+ " <td>29.27</td>\n",
236
+ " <td>46.30</td>\n",
237
+ " <td>46.30</td>\n",
238
+ " <td>47.22</td>\n",
239
+ " <td>47.22</td>\n",
240
+ " <td>45.580000</td>\n",
241
+ " <td>45.580000</td>\n",
242
+ " </tr>\n",
243
+ " <tr>\n",
244
+ " <th>16</th>\n",
245
+ " <td>Qwen-14B-Chat</td>\n",
246
+ " <td>43.78</td>\n",
247
+ " <td>47.81</td>\n",
248
+ " <td>56.58</td>\n",
249
+ " <td>59.40</td>\n",
250
+ " <td>62.09</td>\n",
251
+ " <td>59.70</td>\n",
252
+ " <td>49.060000</td>\n",
253
+ " <td>55.880000</td>\n",
254
+ " </tr>\n",
255
+ " <tr>\n",
256
+ " <th>17</th>\n",
257
+ " <td>LLaMA-2-70B-Chat</td>\n",
258
+ " <td>25.29</td>\n",
259
+ " <td>25.29</td>\n",
260
+ " <td>57.97</td>\n",
261
+ " <td>58.06</td>\n",
262
+ " <td>52.97</td>\n",
263
+ " <td>52.97</td>\n",
264
+ " <td>58.550000</td>\n",
265
+ " <td>58.550000</td>\n",
266
+ " </tr>\n",
267
+ " <tr>\n",
268
+ " <th>18</th>\n",
269
+ " <td>ERNIE-Bot-4.0</td>\n",
270
+ " <td>61.15</td>\n",
271
+ " <td>61.15</td>\n",
272
+ " <td>70.00</td>\n",
273
+ " <td>70.00</td>\n",
274
+ " <td>60.00</td>\n",
275
+ " <td>60.00</td>\n",
276
+ " <td>70.000000</td>\n",
277
+ " <td>70.000000</td>\n",
278
+ " </tr>\n",
279
+ " <tr>\n",
280
+ " <th>19</th>\n",
281
+ " <td>ChatGLM3-6B</td>\n",
282
+ " <td>43.38487973</td>\n",
283
+ " <td>43.38487973</td>\n",
284
+ " <td>44.58762887</td>\n",
285
+ " <td>44.58762887</td>\n",
286
+ " <td>42.09621993</td>\n",
287
+ " <td>42.09621993</td>\n",
288
+ " <td>43.470790</td>\n",
289
+ " <td>43.470790</td>\n",
290
+ " </tr>\n",
291
+ " <tr>\n",
292
+ " <th>20</th>\n",
293
+ " <td>InternLM2-Chat-20B</td>\n",
294
+ " <td>56.35738832</td>\n",
295
+ " <td>56.35738832</td>\n",
296
+ " <td>26.18025751</td>\n",
297
+ " <td>26.18025751</td>\n",
298
+ " <td>60.48109966</td>\n",
299
+ " <td>60.48109966</td>\n",
300
+ " <td>45.103093</td>\n",
301
+ " <td>45.103093</td>\n",
302
+ " </tr>\n",
303
+ " <tr>\n",
304
+ " <th>21</th>\n",
305
+ " <td>InternLM2-Chat-7B</td>\n",
306
+ " <td>49.74226804</td>\n",
307
+ " <td>49.74226804</td>\n",
308
+ " <td>56.18556701</td>\n",
309
+ " <td>56.18556701</td>\n",
310
+ " <td>48.19587629</td>\n",
311
+ " <td>48.19587629</td>\n",
312
+ " <td>49.742268</td>\n",
313
+ " <td>49.742268</td>\n",
314
+ " </tr>\n",
315
+ " <tr>\n",
316
+ " <th>22</th>\n",
317
+ " <td>gemma_2b</td>\n",
318
+ " <td>26.46048</td>\n",
319
+ " <td>26.46048</td>\n",
320
+ " <td>33.41924</td>\n",
321
+ " <td>33.41924</td>\n",
322
+ " <td>26.6323</td>\n",
323
+ " <td>26.6323</td>\n",
324
+ " <td>37.542960</td>\n",
325
+ " <td>37.542960</td>\n",
326
+ " </tr>\n",
327
+ " <tr>\n",
328
+ " <th>23</th>\n",
329
+ " <td>gemma_7b</td>\n",
330
+ " <td>25.08591</td>\n",
331
+ " <td>25.08591</td>\n",
332
+ " <td>50.85911</td>\n",
333
+ " <td>50.85911</td>\n",
334
+ " <td>30.24055</td>\n",
335
+ " <td>30.24055</td>\n",
336
+ " <td>51.557470</td>\n",
337
+ " <td>51.557470</td>\n",
338
+ " </tr>\n",
339
+ " <tr>\n",
340
+ " <th>24</th>\n",
341
+ " <td>qwen1.5-14b-base</td>\n",
342
+ " <td>34.87973</td>\n",
343
+ " <td>34.87973</td>\n",
344
+ " <td>60.82474</td>\n",
345
+ " <td>60.82474</td>\n",
346
+ " <td>65.54983</td>\n",
347
+ " <td>65.54983</td>\n",
348
+ " <td>47.079040</td>\n",
349
+ " <td>47.079040</td>\n",
350
+ " </tr>\n",
351
+ " <tr>\n",
352
+ " <th>25</th>\n",
353
+ " <td>qwen1.5-14b-chat</td>\n",
354
+ " <td>54.89691</td>\n",
355
+ " <td>56.4433</td>\n",
356
+ " <td>64.08935</td>\n",
357
+ " <td>67.09622</td>\n",
358
+ " <td>52.23368</td>\n",
359
+ " <td>53.52234</td>\n",
360
+ " <td>59.536080</td>\n",
361
+ " <td>64.175260</td>\n",
362
+ " </tr>\n",
363
+ " </tbody>\n",
364
+ "</table>\n",
365
+ "</div>"
366
+ ],
367
+ "text/plain": [
368
+ " name zero_native zero_self_con zero_cot \\\n",
369
+ "0 Baichuan-13B-Chat 18.3 20.4 28.6 \n",
370
+ "1 Chinese-Alpaca-2-13B 37.7 37.7 49.7 \n",
371
+ "2 GPT-3.5-turbo 66.6 66.8 69.6 \n",
372
+ "3 LLaMA-2-13B 41.8 46.5 53.1 \n",
373
+ "4 Qwen-7B-Chat 45.9 46 47.3 \n",
374
+ "5 ChatGLM2-6B 24.8 24.7 36.6 \n",
375
+ "6 Chinese-LLaMA-2-13B 29.4 29.4 37.8 \n",
376
+ "7 InternLM-7B 38.7 38.7 43.9 \n",
377
+ "8 LLaMA-2-7B 39.5 40 45.4 \n",
378
+ "9 Baichuan2-13B-Chat 14.1 15.3 24.1 \n",
379
+ "10 GPT-4 / / / \n",
380
+ "11 AquilaChat2-34B 36.63 36.63 44.83 \n",
381
+ "12 Yi-34B-Chat 57.75 59.14 65.11 \n",
382
+ "13 DevOps-Model-14B-Chat 30.69 30.59 55.77 \n",
383
+ "14 Qwen-72B-Chat 70.41 70.50 72.38 \n",
384
+ "15 Mistral-7B 29.27 29.27 46.30 \n",
385
+ "16 Qwen-14B-Chat 43.78 47.81 56.58 \n",
386
+ "17 LLaMA-2-70B-Chat 25.29 25.29 57.97 \n",
387
+ "18 ERNIE-Bot-4.0 61.15 61.15 70.00 \n",
388
+ "19 ChatGLM3-6B 43.38487973 43.38487973 44.58762887 \n",
389
+ "20 InternLM2-Chat-20B 56.35738832 56.35738832 26.18025751 \n",
390
+ "21 InternLM2-Chat-7B 49.74226804 49.74226804 56.18556701 \n",
391
+ "22 gemma_2b 26.46048 26.46048 33.41924 \n",
392
+ "23 gemma_7b 25.08591 25.08591 50.85911 \n",
393
+ "24 qwen1.5-14b-base 34.87973 34.87973 60.82474 \n",
394
+ "25 qwen1.5-14b-chat 54.89691 56.4433 64.08935 \n",
395
+ "\n",
396
+ " zero_cot_self_con few_native few_self_con few_cot few_cot_self_con \n",
397
+ "0 37 24.1 26.7 18.200000 17.800000 \n",
398
+ "1 49.7 48.6 48.6 50.500000 50.500000 \n",
399
+ "2 72 68.3 68.3 70.900000 72.500000 \n",
400
+ "3 58.7 53.3 53 56.800000 61.000000 \n",
401
+ "4 50.1 52.1 51 48.300000 49.800000 \n",
402
+ "5 36.5 37.6 37.6 40.500000 40.500000 \n",
403
+ "6 37.8 40.4 40.4 28.800000 28.800000 \n",
404
+ "7 43.9 45.2 45.2 51.400000 51.400000 \n",
405
+ "8 49.5 48.2 46.8 52.000000 55.200000 \n",
406
+ "9 25.8 32.3 33.1 25.600000 27.700000 \n",
407
+ "10 / / / 88.700000 88.700000 \n",
408
+ "11 44.83 46.65 46.65 NaN NaN \n",
409
+ "12 68.79 68.16 68.37 78.090000 80.060000 \n",
410
+ "13 63.63 63.85 61.96 41.150000 44.010000 \n",
411
+ "14 72.56 70.32 70.32 70.130000 70.220000 \n",
412
+ "15 46.30 47.22 47.22 45.580000 45.580000 \n",
413
+ "16 59.40 62.09 59.70 49.060000 55.880000 \n",
414
+ "17 58.06 52.97 52.97 58.550000 58.550000 \n",
415
+ "18 70.00 60.00 60.00 70.000000 70.000000 \n",
416
+ "19 44.58762887 42.09621993 42.09621993 43.470790 43.470790 \n",
417
+ "20 26.18025751 60.48109966 60.48109966 45.103093 45.103093 \n",
418
+ "21 56.18556701 48.19587629 48.19587629 49.742268 49.742268 \n",
419
+ "22 33.41924 26.6323 26.6323 37.542960 37.542960 \n",
420
+ "23 50.85911 30.24055 30.24055 51.557470 51.557470 \n",
421
+ "24 60.82474 65.54983 65.54983 47.079040 47.079040 \n",
422
+ "25 67.09622 52.23368 53.52234 59.536080 64.175260 "
423
+ ]
424
+ },
425
+ "execution_count": 2,
426
+ "metadata": {},
427
+ "output_type": "execute_result"
428
+ }
429
+ ],
430
+ "source": [
431
+ "df = pd.read_csv(\"./data/network_en_mc.csv\")\n",
432
+ "df"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 20,
438
+ "metadata": {},
439
+ "outputs": [
440
+ {
441
+ "data": {
442
+ "text/html": [
443
+ "<div>\n",
444
+ "<style scoped>\n",
445
+ " .dataframe tbody tr th:only-of-type {\n",
446
+ " vertical-align: middle;\n",
447
+ " }\n",
448
+ "\n",
449
+ " .dataframe tbody tr th {\n",
450
+ " vertical-align: top;\n",
451
+ " }\n",
452
+ "\n",
453
+ " .dataframe thead th {\n",
454
+ " text-align: right;\n",
455
+ " }\n",
456
+ "</style>\n",
457
+ "<table border=\"1\" class=\"dataframe\">\n",
458
+ " <thead>\n",
459
+ " <tr style=\"text-align: right;\">\n",
460
+ " <th></th>\n",
461
+ " <th>Naive</th>\n",
462
+ " <th>SC</th>\n",
463
+ " <th>CoT</th>\n",
464
+ " <th>CoT+SC</th>\n",
465
+ " </tr>\n",
466
+ " </thead>\n",
467
+ " <tbody>\n",
468
+ " <tr>\n",
469
+ " <th>0</th>\n",
470
+ " <td>NaN</td>\n",
471
+ " <td>NaN</td>\n",
472
+ " <td>NaN</td>\n",
473
+ " <td>NaN</td>\n",
474
+ " </tr>\n",
475
+ " <tr>\n",
476
+ " <th>1</th>\n",
477
+ " <td>57.75</td>\n",
478
+ " <td>59.14</td>\n",
479
+ " <td>65.11</td>\n",
480
+ " <td>68.79</td>\n",
481
+ " </tr>\n",
482
+ " <tr>\n",
483
+ " <th>2</th>\n",
484
+ " <td>70.41</td>\n",
485
+ " <td>70.50</td>\n",
486
+ " <td>72.38</td>\n",
487
+ " <td>72.56</td>\n",
488
+ " </tr>\n",
489
+ " <tr>\n",
490
+ " <th>3</th>\n",
491
+ " <td>66.60</td>\n",
492
+ " <td>66.80</td>\n",
493
+ " <td>69.60</td>\n",
494
+ " <td>72.00</td>\n",
495
+ " </tr>\n",
496
+ " <tr>\n",
497
+ " <th>4</th>\n",
498
+ " <td>61.15</td>\n",
499
+ " <td>61.15</td>\n",
500
+ " <td>70.00</td>\n",
501
+ " <td>70.00</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>5</th>\n",
505
+ " <td>54.90</td>\n",
506
+ " <td>56.44</td>\n",
507
+ " <td>64.09</td>\n",
508
+ " <td>67.10</td>\n",
509
+ " </tr>\n",
510
+ " <tr>\n",
511
+ " <th>6</th>\n",
512
+ " <td>34.88</td>\n",
513
+ " <td>34.88</td>\n",
514
+ " <td>60.82</td>\n",
515
+ " <td>60.82</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <th>7</th>\n",
519
+ " <td>30.69</td>\n",
520
+ " <td>30.59</td>\n",
521
+ " <td>55.77</td>\n",
522
+ " <td>63.63</td>\n",
523
+ " </tr>\n",
524
+ " <tr>\n",
525
+ " <th>8</th>\n",
526
+ " <td>43.78</td>\n",
527
+ " <td>47.81</td>\n",
528
+ " <td>56.58</td>\n",
529
+ " <td>59.40</td>\n",
530
+ " </tr>\n",
531
+ " <tr>\n",
532
+ " <th>9</th>\n",
533
+ " <td>41.80</td>\n",
534
+ " <td>46.50</td>\n",
535
+ " <td>53.10</td>\n",
536
+ " <td>58.70</td>\n",
537
+ " </tr>\n",
538
+ " <tr>\n",
539
+ " <th>10</th>\n",
540
+ " <td>56.36</td>\n",
541
+ " <td>56.36</td>\n",
542
+ " <td>26.18</td>\n",
543
+ " <td>26.18</td>\n",
544
+ " </tr>\n",
545
+ " <tr>\n",
546
+ " <th>11</th>\n",
547
+ " <td>25.29</td>\n",
548
+ " <td>25.29</td>\n",
549
+ " <td>57.97</td>\n",
550
+ " <td>58.06</td>\n",
551
+ " </tr>\n",
552
+ " <tr>\n",
553
+ " <th>12</th>\n",
554
+ " <td>49.74</td>\n",
555
+ " <td>49.74</td>\n",
556
+ " <td>56.19</td>\n",
557
+ " <td>56.19</td>\n",
558
+ " </tr>\n",
559
+ " <tr>\n",
560
+ " <th>13</th>\n",
561
+ " <td>39.50</td>\n",
562
+ " <td>40.00</td>\n",
563
+ " <td>45.40</td>\n",
564
+ " <td>49.50</td>\n",
565
+ " </tr>\n",
566
+ " <tr>\n",
567
+ " <th>14</th>\n",
568
+ " <td>45.90</td>\n",
569
+ " <td>46.00</td>\n",
570
+ " <td>47.30</td>\n",
571
+ " <td>50.10</td>\n",
572
+ " </tr>\n",
573
+ " <tr>\n",
574
+ " <th>15</th>\n",
575
+ " <td>25.09</td>\n",
576
+ " <td>25.09</td>\n",
577
+ " <td>50.86</td>\n",
578
+ " <td>50.86</td>\n",
579
+ " </tr>\n",
580
+ " <tr>\n",
581
+ " <th>16</th>\n",
582
+ " <td>38.70</td>\n",
583
+ " <td>38.70</td>\n",
584
+ " <td>43.90</td>\n",
585
+ " <td>43.90</td>\n",
586
+ " </tr>\n",
587
+ " <tr>\n",
588
+ " <th>17</th>\n",
589
+ " <td>37.70</td>\n",
590
+ " <td>37.70</td>\n",
591
+ " <td>49.70</td>\n",
592
+ " <td>49.70</td>\n",
593
+ " </tr>\n",
594
+ " <tr>\n",
595
+ " <th>18</th>\n",
596
+ " <td>29.27</td>\n",
597
+ " <td>29.27</td>\n",
598
+ " <td>46.30</td>\n",
599
+ " <td>46.30</td>\n",
600
+ " </tr>\n",
601
+ " <tr>\n",
602
+ " <th>19</th>\n",
603
+ " <td>36.63</td>\n",
604
+ " <td>36.63</td>\n",
605
+ " <td>44.83</td>\n",
606
+ " <td>44.83</td>\n",
607
+ " </tr>\n",
608
+ " <tr>\n",
609
+ " <th>20</th>\n",
610
+ " <td>43.38</td>\n",
611
+ " <td>43.38</td>\n",
612
+ " <td>44.59</td>\n",
613
+ " <td>44.59</td>\n",
614
+ " </tr>\n",
615
+ " <tr>\n",
616
+ " <th>21</th>\n",
617
+ " <td>24.80</td>\n",
618
+ " <td>24.70</td>\n",
619
+ " <td>36.60</td>\n",
620
+ " <td>36.50</td>\n",
621
+ " </tr>\n",
622
+ " <tr>\n",
623
+ " <th>22</th>\n",
624
+ " <td>29.40</td>\n",
625
+ " <td>29.40</td>\n",
626
+ " <td>37.80</td>\n",
627
+ " <td>37.80</td>\n",
628
+ " </tr>\n",
629
+ " <tr>\n",
630
+ " <th>23</th>\n",
631
+ " <td>26.46</td>\n",
632
+ " <td>26.46</td>\n",
633
+ " <td>33.42</td>\n",
634
+ " <td>33.42</td>\n",
635
+ " </tr>\n",
636
+ " <tr>\n",
637
+ " <th>24</th>\n",
638
+ " <td>18.30</td>\n",
639
+ " <td>20.40</td>\n",
640
+ " <td>28.60</td>\n",
641
+ " <td>37.00</td>\n",
642
+ " </tr>\n",
643
+ " <tr>\n",
644
+ " <th>25</th>\n",
645
+ " <td>14.10</td>\n",
646
+ " <td>15.30</td>\n",
647
+ " <td>24.10</td>\n",
648
+ " <td>25.80</td>\n",
649
+ " </tr>\n",
650
+ " </tbody>\n",
651
+ "</table>\n",
652
+ "</div>"
653
+ ],
654
+ "text/plain": [
655
+ " Naive SC CoT CoT+SC\n",
656
+ "0 NaN NaN NaN NaN\n",
657
+ "1 57.75 59.14 65.11 68.79\n",
658
+ "2 70.41 70.50 72.38 72.56\n",
659
+ "3 66.60 66.80 69.60 72.00\n",
660
+ "4 61.15 61.15 70.00 70.00\n",
661
+ "5 54.90 56.44 64.09 67.10\n",
662
+ "6 34.88 34.88 60.82 60.82\n",
663
+ "7 30.69 30.59 55.77 63.63\n",
664
+ "8 43.78 47.81 56.58 59.40\n",
665
+ "9 41.80 46.50 53.10 58.70\n",
666
+ "10 56.36 56.36 26.18 26.18\n",
667
+ "11 25.29 25.29 57.97 58.06\n",
668
+ "12 49.74 49.74 56.19 56.19\n",
669
+ "13 39.50 40.00 45.40 49.50\n",
670
+ "14 45.90 46.00 47.30 50.10\n",
671
+ "15 25.09 25.09 50.86 50.86\n",
672
+ "16 38.70 38.70 43.90 43.90\n",
673
+ "17 37.70 37.70 49.70 49.70\n",
674
+ "18 29.27 29.27 46.30 46.30\n",
675
+ "19 36.63 36.63 44.83 44.83\n",
676
+ "20 43.38 43.38 44.59 44.59\n",
677
+ "21 24.80 24.70 36.60 36.50\n",
678
+ "22 29.40 29.40 37.80 37.80\n",
679
+ "23 26.46 26.46 33.42 33.42\n",
680
+ "24 18.30 20.40 28.60 37.00\n",
681
+ "25 14.10 15.30 24.10 25.80"
682
+ ]
683
+ },
684
+ "execution_count": 20,
685
+ "metadata": {},
686
+ "output_type": "execute_result"
687
+ }
688
+ ],
689
+ "source": [
690
+ "def process_mc_df(df):\n",
691
+ " # 将name列重命名为Model\n",
692
+ " df = df.rename(columns={\"name\": \"Model\"})\n",
693
+ " # 将zero_native, zero_self_con, zero_cot, zero_cot_self_con, few_native, few_self_con, few_cot, few_cot_self_con列重新组织成MultiIndex,一层为Zeroshot, Fewshot,一层为Naive, Self-Consistency, CoT, CoT+Self-Consistency\n",
694
+ " df = df.set_index(\"Model\")\n",
695
+ " # df = df.stack().unstack()\n",
696
+ " df.columns = pd.MultiIndex.from_tuples([(\"Zeroshot\", \"Naive\"), (\"Zeroshot\", \"SC\"), (\"Zeroshot\", \"CoT\"), (\"Zeroshot\", \"CoT+SC\"), (\"Fewshot\", \"Naive\"), (\"Fewshot\", \"SC\"), (\"Fewshot\", \"CoT\"), (\"Fewshot\", \"CoT+SC\")])\n",
697
+ " # 将除了Model列之外的列的value转换为数值型,失败的为NaN\n",
698
+ " df = df.apply(pd.to_numeric, errors=\"coerce\")\n",
699
+ " # 显示小数点后两位\n",
700
+ " df = df.round(2)\n",
701
+ " # 给每一行添加一列BestScore\n",
702
+ " df[\"BestScore\"] = df.max(axis=1)\n",
703
+ " # 根据BestScore给df排序\n",
704
+ " df = df.sort_values(by=\"BestScore\", ascending=False)\n",
705
+ " # \n",
706
+ " df = df.reset_index()\n",
707
+ " return df\n",
708
+ "\n",
709
+ "processed = process_mc_df(df)\n",
710
+ "processed.columns\n",
711
+ "processed['Zeroshot']"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": null,
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": []
720
+ }
721
+ ],
722
+ "metadata": {
723
+ "kernelspec": {
724
+ "display_name": "opencompass",
725
+ "language": "python",
726
+ "name": "python3"
727
+ },
728
+ "language_info": {
729
+ "codemirror_mode": {
730
+ "name": "ipython",
731
+ "version": 3
732
+ },
733
+ "file_extension": ".py",
734
+ "mimetype": "text/x-python",
735
+ "name": "python",
736
+ "nbconvert_exporter": "python",
737
+ "pygments_lexer": "ipython3",
738
+ "version": "3.10.14"
739
+ }
740
+ },
741
+ "nbformat": 4,
742
+ "nbformat_minor": 2
743
+ }
leaderboards.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eng_leaderboards = [
2
+ ('zte', ['mc']),
3
+ ('lenovo', ['mc']),
4
+ ('oracle', ['mc']),
5
+ ('network', ['mc', 'qa']),
6
+ ]
7
+
8
+ chi_leaderboards = [
9
+ ('huaweicloud', ['mc']),
10
+ ('gtja', ['mc']),
11
+ ('zjyd', ['mc', 'qa']),
12
+ ('network', ['mc', 'qa']),
13
+ ('pufa', ['mc']),
14
+ ('zabbix', ['mc']),
15
+ ('dfcdata', ['mc']),
16
+ ('zte', ['mc']),
17
+ ('oracle', ['mc']),
18
+ ('tencent', ['qa']),
19
+ ('bosc', ['mc']),
20
+ ('rzy', ['mc']),
21
+ ('lenovo', ['mc']),
22
+ ]
opseval_datasets.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets_abbr = [
2
+ 'huaweicloud',
3
+ 'gtja',
4
+ 'zjyd',
5
+ 'network',
6
+ 'pufa',
7
+ 'zabbix',
8
+ 'dfcdata',
9
+ 'zte',
10
+ 'oracle',
11
+ 'tencent',
12
+ 'bosc',
13
+ 'rzy',
14
+ 'lenovo'
15
+ ]
16
+
17
+ datasets_zh = [
18
+ '5G通信运维(华为核心网)',
19
+ '证券信息系统运维(国泰君安)',
20
+ '中国移动浙江公司',
21
+ '有线网络运维(清华Netman)',
22
+ '金融IT运维(浦发银行)',
23
+ '运维监控能力测评(Zabbix中国宏时数据)',
24
+ '数据库运维(基石数据)',
25
+ '5G通信网络运维(中兴通信)',
26
+ 'Oracle数据库运维(中亦科技)',
27
+ 'DevOps能力评测(腾讯)',
28
+ '金融信创系统运维(上海银行)',
29
+ '日志分析能力评测(日志易)',
30
+ '混合云建设与运维(联想集团)'
31
+ ]
32
+
33
+ datasets_en = [
34
+ "5G Telecommunications",
35
+ "Securities Information System",
36
+ "China Mobile Zhejiang",
37
+ "Wired Network Operations",
38
+ "Financial IT",
39
+ "Operations Monitoring Capability",
40
+ "Database",
41
+ "5G Telecommunications Network",
42
+ "Oracle Database",
43
+ "DevOps Capability",
44
+ "Financial New Generation System",
45
+ "Log Analysis",
46
+ "Hybrid Cloud Construction and Operations"
47
+ ]
48
+
49
+ dataset_abbr_zh_dict = {
50
+ da: dz for da, dz in zip(datasets_abbr, datasets_zh)
51
+ }
52
+
53
+ dataset_abbr_en_dict = {
54
+ da: de for da, de in zip(datasets_abbr, datasets_en)
55
+ }
56
+
57
+ dataset_zh_en_dict = {
58
+ dz: de for dz, de in zip(datasets_zh, datasets_en)
59
+ }
60
+
61
+ dataset_en_zh_dict = {
62
+ de: dz for dz, de in zip(datasets_zh, datasets_en)
63
+ }