davidkim205 committed on
Commit
2b5689e
·
1 Parent(s): ce9edc9

Add application file

Browse files
Files changed (1) hide show
  1. app.py +264 -0
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import random
6
+ import plotly.graph_objects as go
7
+
8
+ file_result_score = 'ko_bench.csv'
9
+
10
+ file_full_lb = 'mt_bench_240805.csv'
11
+
12
+
13
+ # read csv
14
+ df_result_score = pd.read_csv(file_result_score)
15
+ df_full_lb = pd.read_csv(file_full_lb)
16
+
17
+
18
+ # dataframe
19
+ df = pd.DataFrame(df_result_score)
20
+ df_rs = pd.DataFrame(df_result_score)
21
+ df_full_lboard = pd.DataFrame(df_full_lb)
22
+
23
+ df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # MT-bench์˜ GPT-4-1106-preview ๋ฅผ gpt-4-0125-preview๋กœ ๋ณ€๊ฒฝ
24
+ models = df_full_lboard['Model'].unique() # ์—ด ์ถ”๊ฐ€๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
25
+ df_rs.replace("", np.nan, inplace=True) # ๋ชจ๋ธ๋ณ„ turn1,2 score ํ•ฉ๋ณ‘
26
+
27
def custom_mean(series):
    """Mean of *series* after coercing values to numeric; NaN when nothing is numeric.

    Non-numeric entries become NaN via ``errors='coerce'`` and are ignored by mean().
    """
    numeric = pd.to_numeric(series, errors='coerce')
    # If every value coerced to NaN there is nothing to average.
    if numeric.isna().all():
        return np.nan
    return numeric.mean()
30
+
31
def get_mt_bench(model):
    """Look up a model's MT-Bench score, matching model names case-insensitively.

    Returns '' when the model is not in the MT-Bench leaderboard table.
    """
    hits = df_full_lboard[df_full_lboard['Model'].str.lower() == model.lower()]
    if hits.empty:
        return ''
    return hits['MT-bench (score)'].values[0]
37
+
38
def get_organization(model):
    """Return the organization for *model*, matched case-insensitively.

    A couple of models are missing from the MT-Bench table, so they are
    special-cased before the leaderboard lookup; '' when nothing matches.
    """
    lowered = model.lower()
    # Models absent from the MT-Bench leaderboard.
    if 'mistral-large' in lowered:
        return 'Mistral'
    if 'koni-llama3-8b' in lowered:
        return 'KISTI'

    hits = df_full_lboard[df_full_lboard['Model'].str.lower() == lowered]
    if hits.empty:
        return ''
    return hits['Organization'].values[0]
49
+
50
def get_license(model):
    """Return the license for *model*, matched case-insensitively.

    A couple of models are missing from the MT-Bench table, so they are
    special-cased before the leaderboard lookup; '' when nothing matches.
    """
    lowered = model.lower()
    # Models absent from the MT-Bench leaderboard.
    if 'mistral-large' in lowered:
        return 'Apache-2.0'
    if 'koni-llama3-8b' in lowered:
        return 'llama3'

    hits = df_full_lboard[df_full_lboard['Model'].str.lower() == lowered]
    if hits.empty:
        return ''
    return hits['License'].values[0]
61
+
62
+
63
# --- dataframe_full: combined KO-Bench leaderboard --------------------------
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
# The per-category columns are not shown on the combined leaderboard.
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])

# Merge turn 1/2 scores into one average per (model, judge_model).
df_full_rs = df_full_rs.drop(columns=['turn'])
df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg(
    {col: custom_mean for col in df_full_rs.columns if col not in ['model', 'judge_model']}
).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)

# Split the KO-Bench score into per-judge columns (openai vs keval).
df_full_rs['KO-Bench/openai'] = ''
df_full_rs['KO-Bench/keval'] = ''
for idx, judge in df_full_rs['judge_model'].items():
    target = 'KO-Bench/keval' if judge == 'keval' else 'KO-Bench/openai'
    df_full_rs.at[idx, target] = df_full_rs.at[idx, 'KO-Bench']
df_full_rs = df_full_rs.drop(columns=['judge_model'])

# Collapse the per-judge rows into a single row per model.
df_full_rs = df_full_rs.groupby(['model']).agg(
    {col: custom_mean for col in df_full_rs.columns if col not in ['model']}
).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)

# Reference MT-Bench score column; '-' placeholders are stripped.
# NOTE(review): .str.replace assumes get_mt_bench yields strings — confirm CSV dtype.
df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)

# Organization / License metadata columns.
df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
df_full_rs['License'] = df_full_rs['model'].apply(get_license)

# Rank by the averaged KO-Bench score, then drop the helper column.
df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])

plot_models = df_full_rs['model'].unique()  # model list for the detail view
101
+
102
+
103
# --- dataframe: attach MT-Bench reference scores to the raw score table -----
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)

df_rs.replace("", np.nan, inplace=True)  # prepare for merging turn 1/2 scores
109
+
110
+
111
# --- dataframe_openai: leaderboard judged by the OpenAI judge ---------------
df_openai = pd.DataFrame(df_rs)
df_openai = df_openai[df_openai['judge_model'] != 'keval']

# Merge turn 1/2 scores into one average per model.
df_openai = df_openai.drop(columns=['judge_model', 'turn'])
df_openai = df_openai.groupby('model').agg(
    {col: custom_mean for col in df_openai.columns if col != 'model'}
).reset_index()
df_openai = df_openai.round(2)

# Rank by score (index labels keep their pre-sort values).
df_openai = df_openai.sort_values(by='score', ascending=False)
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
121
+
122
+
123
# --- dataframe_keval: leaderboard judged by the keval judge -----------------
df_keval = pd.DataFrame(df_rs)
df_keval = df_keval[df_keval['judge_model'] == 'keval']

# Merge turn 1/2 scores into one average per model.
df_keval = df_keval.drop(columns=['judge_model', 'turn'])
df_keval = df_keval.groupby('model').agg(
    {col: custom_mean for col in df_keval.columns if col != 'model'}
).reset_index()
df_keval = df_keval.round(2)

# Rank by score (index labels keep their pre-sort values).
df_keval = df_keval.sort_values(by='score', ascending=False)
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
133
+
134
+
135
# --- model detail view constants --------------------------------------------
plot_models_list = plot_models.tolist()
# Radar-chart axes (KO-Bench categories) and the legend labels for its traces.
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
category_labels = ['Selected model turn1', 'Selected model turn2', 'Top1 turn1', 'Top1 turn2']
random.seed(42)  # keep any random behavior deterministic
140
+
141
def search_dataframe(query):
    """Return rows of the raw score table where any cell equals *query* exactly.

    An empty/falsy query returns the whole table.
    """
    if not query:
        return df
    # Compare every cell (stringified) against the query; keep matching rows.
    mask = df.astype(str).eq(query).any(axis=1)
    return df[mask]
146
+
147
def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2):
    """Radar chart comparing the selected model with the top-1 model over both turns.

    Each score argument is a list of rows (lists) as produced by
    ``DataFrame.values.tolist()``; they are flattened and plotted as one trace
    each. Returns a plotly Figure, or an error string when a dataset's length
    does not match the number of categories.
    """
    # Flatten each list-of-rows into a flat list of float scores.
    datasets = []
    for rows in (Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2):
        flat = [cell for row in rows for cell in row]
        datasets.append([float(v) for v in flat])

    fig = go.Figure()

    for i, values in enumerate(datasets):
        if len(categories) != len(values):
            return f"Error in dataset {i+1}: Number of categories and values must be the same."

        # Repeat the first point so each trace closes its loop.
        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],
            theta=categories + [categories[0]],
            mode='lines',
            name=category_labels[i],
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                # Scale the radial axis to the largest score across all traces.
                range=[0, max(max(values) for values in datasets)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise',
            ),
        ),
        showlegend=True,
        width=555,   # fixed chart width
        height=550,  # fixed chart height
        margin=dict(l=1000, r=20, t=20, b=20),
        autosize=False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey',
    )
    return fig
196
+
197
def search_openai_plot(dropdown_model):
    """Radar chart of per-category scores (OpenAI judge) for the chosen model.

    Plots both turns of *dropdown_model* against both turns of the rank-1
    model on the OpenAI-judged leaderboard; returns the plotly figure.
    """
    # BUG FIX: df_openai keeps its pre-sort index labels, so .loc[0, 'model']
    # returned an arbitrary model (original label 0), not the rank-1 row the
    # top1_* comparison intends; .iloc[0] selects by position after the sort.
    top1_model = df_openai.iloc[0]['model']

    def _scores(turn, model):
        # Per-category scores ('Coding'..'Writing') for one model/turn.
        cond = (df['judge_model'] != 'keval') & (df['turn'] == turn) & (df['model'] == model)
        return df.loc[cond, 'Coding':'Writing'].values.tolist()

    return radar_chart(
        CATEGORIES,
        _scores(1, dropdown_model),
        _scores(2, dropdown_model),
        _scores(1, top1_model),
        _scores(2, top1_model),
    )
212
+
213
def search_keval_plot(dropdown_model):
    """Radar chart of per-category scores (keval judge) for the chosen model.

    Plots both turns of *dropdown_model* against both turns of the rank-1
    model on the keval-judged leaderboard; returns the plotly figure.
    """
    # BUG FIX: df_keval keeps its pre-sort index labels, so .loc[0, 'model']
    # returned an arbitrary model (original label 0), not the rank-1 row the
    # top1_* comparison intends; .iloc[0] selects by position after the sort.
    top1_model = df_keval.iloc[0]['model']

    def _scores(turn, model):
        # Per-category scores ('Coding'..'Writing') for one model/turn.
        cond = (df['judge_model'] == 'keval') & (df['turn'] == turn) & (df['model'] == model)
        return df.loc[cond, 'Coding':'Writing'].values.tolist()

    return radar_chart(
        CATEGORIES,
        _scores(1, dropdown_model),
        _scores(2, dropdown_model),
        _scores(1, top1_model),
        _scores(2, top1_model),
    )
228
+
229
+
230
# --- gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("")
    gr.Markdown("# 🏆 KO-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
    gr.Markdown("")
    gr.Markdown("github : https://github.com/davidkim205/ko-bench")
    gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")

    # Leaderboard tabs: combined view plus one per judge.
    with gr.TabItem("KO-Bench"):
        gr.Dataframe(value=df_full_rs)
    with gr.TabItem("Openai Judgment"):
        gr.Dataframe(value=df_openai)
    with gr.TabItem("Keval Judgment"):
        gr.Dataframe(value=df_keval)

    # Per-model drill-down: raw score rows plus radar charts for both judges.
    with gr.TabItem("Model Detail View"):
        with gr.Blocks():
            with gr.Row():
                dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
            with gr.Row():
                dataframe = gr.Dataframe(label="Model Detail View")
                dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
            with gr.Row():
                plot_openai = gr.Plot(label="Openai Plot")
                dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
                # Both plots share one row (a separate Row was commented out upstream).
                plot_keval = gr.Plot(label="Keval Plot")
                dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0")