davidkim205 committed · commit 2b5689e
1 Parent(s): ce9edc9
Add application file
app.py
ADDED
@@ -0,0 +1,264 @@
import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
import random
import plotly.graph_objects as go

file_result_score = 'ko_bench.csv'

file_full_lb = 'mt_bench_240805.csv'


# read csv
df_result_score = pd.read_csv(file_result_score)
df_full_lb = pd.read_csv(file_full_lb)


# dataframe
df = pd.DataFrame(df_result_score)
df_rs = pd.DataFrame(df_result_score)
df_full_lboard = pd.DataFrame(df_full_lb)

df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True)  # rename MT-Bench's GPT-4-1106-preview to gpt-4-0125-preview
models = df_full_lboard['Model'].unique()  # model list used when adding columns
df_rs.replace("", np.nan, inplace=True)  # merge the turn 1/2 scores per model

def custom_mean(series):
    numeric_series = pd.to_numeric(series, errors='coerce')  # convert the series to numeric
    return numeric_series.mean() if not numeric_series.isna().all() else np.nan  # compute the mean if at least one non-NaN value exists

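# Illustrative only (not part of the committed file): custom_mean coerces
# non-numeric entries to NaN before averaging, e.g.
#   custom_mean(pd.Series(['7.5', '8.0', '']))  # -> 7.75
#   custom_mean(pd.Series(['', 'n/a']))         # -> nan
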
def get_mt_bench(model):  # match the model name case-insensitively
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['MT-bench (score)'].values[0]
    return ''

def get_organization(model):  # match the model name case-insensitively
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Mistral'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'KISTI'

    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['Organization'].values[0]
    return ''

def get_license(model):  # match the model name case-insensitively
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Apache-2.0'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'llama3'

    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['License'].values[0]
    return ''

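# Illustrative only: the three helpers above fall back to the MT-Bench leaderboard
# CSV when a model has no hard-coded entry; e.g. (hypothetical model name)
# get_organization('gpt-4o-2024-05-13') returns that row's 'Organization' value
# if it exists in mt_bench_240805.csv, and '' otherwise.
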

# dataframe_full
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])

df_full_rs = df_full_rs.drop(columns=['turn'])  # merge the turn 1/2 scores per model
df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model', 'judge_model']}).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)

df_full_rs['KO-Bench/openai'] = ''  # add the KO-Bench/openai and KO-Bench/keval columns
df_full_rs['KO-Bench/keval'] = ''
for idx, j_model in df_full_rs['judge_model'].items():
    if j_model == 'keval':
        df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
    else:
        df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
df_full_rs = df_full_rs.drop(columns=['judge_model'])

df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index()  # merge the KO-Bench/openai and KO-Bench/keval rows
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)

df_full_rs['MT-Bench'] = ''  # add the MT-Bench column
df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)

df_full_rs['Organization'] = ''  # add the Organization column
df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)

df_full_rs['License'] = ''  # add the License column
df_full_rs['License'] = df_full_rs['model'].apply(get_license)

df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])

plot_models = df_full_rs['model'].unique()  # model list for the model detail view

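# At this point df_full_rs holds one row per model with the columns
# rank, model, KO-Bench/openai, KO-Bench/keval, MT-Bench, Organization and License
# (the combined KO-Bench column is dropped after sorting).
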

# dataframe
df_rs['MT-Bench'] = ''  # add the MT-Bench column
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)

df_rs.replace("", np.nan, inplace=True)  # merge the turn 1/2 scores per model


# dataframe_openai
df_openai = pd.DataFrame(df_rs)
df_openai = df_openai[df_openai['judge_model'] != 'keval']

df_openai = df_openai.drop(columns=['judge_model', 'turn'])  # merge the turn 1/2 scores per model
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
df_openai = df_openai.round(2)

df_openai = df_openai.sort_values(by='score', ascending=False)
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))


# dataframe_keval
df_keval = pd.DataFrame(df_rs)
df_keval = df_keval[df_keval['judge_model'] == 'keval']

df_keval = df_keval.drop(columns=['judge_model', 'turn'])  # merge the turn 1/2 scores per model
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
df_keval = df_keval.round(2)

df_keval = df_keval.sort_values(by='score', ascending=False)
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))

# model detail view
plot_models_list = plot_models.tolist()
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
category_labels = ['Selected model turn1', 'Selected model turn2', 'Top1 turn1', 'Top1 turn2']
random.seed(42)

def search_dataframe(query):  # search the raw score dataframe
    if not query:
        return df  # return the full DataFrame when there is no query
    filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
    return filtered_df

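# Illustrative only: search_dataframe does an exact, whole-cell match across every
# column of the raw score table, e.g. search_dataframe('keval') returns the rows
# in which any cell (such as judge_model) equals the string 'keval'.
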
def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2):  # draw the radar plot
    #categories = categories.split(',')

    Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
    Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
    Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
    Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]

    values_lists = [
        list(map(float, Selected_model_turn1)),
        list(map(float, Selected_model_turn2)),
        list(map(float, Top1_turn1)),
        list(map(float, Top1_turn2))
    ]

    fig = go.Figure()

    for i, values in enumerate(values_lists):
        if len(categories) != len(values):
            return f"Error in dataset {i+1}: Number of categories and values must be the same."

        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],  # closing the loop of the radar chart
            theta=categories + [categories[0]],  # closing the loop of the radar chart
            mode='lines',
            name=category_labels[i]  # label for the dataset
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max(max(values) for values in values_lists)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise'
            )
        ),
        showlegend=True,
        width=555,  # chart width
        height=550,  # chart height
        margin=dict(l=1000, r=20, t=20, b=20),
        autosize=False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey'
    )
    return fig

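# Illustrative only (made-up scores): radar_chart takes four nested lists with one value
# per category passed in, and returns a Plotly figure with one closed trace per list, e.g.
#   radar_chart(CATEGORIES,
#               [[9.1, 8.7, 7.0, 6.2, 8.8, 9.0, 9.2, 8.5]],
#               [[8.9, 8.5, 6.8, 5.9, 8.6, 8.8, 9.0, 8.3]],
#               [[9.5, 9.0, 8.1, 7.4, 9.2, 9.4, 9.5, 9.1]],
#               [[9.3, 8.8, 7.9, 7.0, 9.0, 9.2, 9.3, 8.9]])
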
def search_openai_plot(dropdown_model):  # build the radar plot for the OpenAI judgments
    condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()

    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()

    condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == df_openai.iloc[0]['model'])  # top-ranked model is the first row after sorting
    top1_openai_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()

    condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == df_openai.iloc[0]['model'])
    top1_openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()

    fig = radar_chart(CATEGORIES, openai_turn1, openai_turn2, top1_openai_turn1, top1_openai_turn2)
    return fig

def search_keval_plot(dropdown_model):  # build the radar plot for the keval judgments
    condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()

    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()

    condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == df_keval.iloc[0]['model'])  # top-ranked model is the first row after sorting
    top1_keval_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()

    condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == df_keval.iloc[0]['model'])
    top1_keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()

    fig = radar_chart(CATEGORIES, keval_turn1, keval_turn2, top1_keval_turn1, top1_keval_turn2)
    return fig

# gradio
with gr.Blocks() as demo:
    gr.Markdown("")
    gr.Markdown("# KO-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### KO-Bench is a leaderboard for evaluating the multi-turn conversation and instruction-following abilities of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as the evaluation model.")
    gr.Markdown("")
    gr.Markdown("GitHub: https://github.com/davidkim205/ko-bench")
    gr.Markdown("keval: https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")

    with gr.TabItem("KO-Bench"):
        gr.Dataframe(value=df_full_rs)
    with gr.TabItem("Openai Judgment"):
        gr.Dataframe(value=df_openai)
    with gr.TabItem("Keval Judgment"):
        gr.Dataframe(value=df_keval)
    with gr.TabItem("Model Detail View"):
        with gr.Blocks():
            with gr.Row():
                dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
            with gr.Row():
                dataframe = gr.Dataframe(label="Model Detail View")
                dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
            with gr.Row():
                plot_openai = gr.Plot(label="Openai Plot")
                dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
            #with gr.Row():
                plot_keval = gr.Plot(label="Keval Plot")
                dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0")
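# To run this Space locally (assuming ko_bench.csv and mt_bench_240805.csv sit next to
# app.py and that gradio, pandas, numpy and plotly are installed): python app.py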