Spaces:
Sleeping
Sleeping
davidkim205
committed on
Commit
·
2d3d046
1
Parent(s):
7172210
update
Browse files
app.py
CHANGED
@@ -62,7 +62,7 @@ def get_license(model): # 대소문자 무시하고 모델을 매칭하기 위
|
|
62 |
|
63 |
# dataframe_full
|
64 |
df_full_rs = df_rs.copy()
|
65 |
-
df_full_rs.rename(columns={'score': '
|
66 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
67 |
|
68 |
df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
|
@@ -70,16 +70,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
|
|
70 |
df_full_rs = df_full_rs.round(2)
|
71 |
df_full_rs.replace("", np.nan, inplace=True)
|
72 |
|
73 |
-
df_full_rs['
|
74 |
-
df_full_rs['
|
75 |
for idx, j_model in df_full_rs['judge_model'].items():
|
76 |
if j_model == 'keval':
|
77 |
-
df_full_rs.at[idx, '
|
78 |
else :
|
79 |
-
df_full_rs.at[idx, '
|
80 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
81 |
|
82 |
-
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() #
|
83 |
df_full_rs = df_full_rs.round(2)
|
84 |
df_full_rs.replace("", np.nan, inplace=True)
|
85 |
|
@@ -93,9 +93,9 @@ df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
|
|
93 |
df_full_rs['License'] = '' # License 열 추가
|
94 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
95 |
|
96 |
-
df_full_rs = df_full_rs.sort_values(by='
|
97 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
98 |
-
df_full_rs = df_full_rs.drop(columns=['
|
99 |
|
100 |
plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์ํ models ๋ฆฌ์คํธ
|
101 |
|
@@ -230,18 +230,18 @@ def search_keval_plot(dropdown_model): # keval plot 함수 정의
|
|
230 |
#gradio
|
231 |
with gr.Blocks() as demo:
|
232 |
gr.Markdown("")
|
233 |
-
gr.Markdown("# ๐
|
234 |
gr.Markdown("")
|
235 |
-
gr.Markdown("#### The Ko-
|
236 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
237 |
-
gr.Markdown("-
|
238 |
-
gr.Markdown("-
|
239 |
gr.Markdown("")
|
240 |
-
gr.Markdown("github : https://github.com/davidkim205/
|
241 |
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
242 |
gr.Markdown("")
|
243 |
|
244 |
-
with gr.TabItem("
|
245 |
gr.Dataframe(value=df_full_rs)
|
246 |
with gr.TabItem("Openai Judgment"):
|
247 |
gr.Dataframe(value=df_openai)
|
|
|
62 |
|
63 |
# dataframe_full
|
64 |
df_full_rs = df_rs.copy()
|
65 |
+
df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
|
66 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
67 |
|
68 |
df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
|
|
|
70 |
df_full_rs = df_full_rs.round(2)
|
71 |
df_full_rs.replace("", np.nan, inplace=True)
|
72 |
|
73 |
+
df_full_rs['Ko-Bench/openai'] = '' # Ko-Bench/openai, Ko-Bench/keval 열 추가
|
74 |
+
df_full_rs['Ko-Bench/keval'] = ''
|
75 |
for idx, j_model in df_full_rs['judge_model'].items():
|
76 |
if j_model == 'keval':
|
77 |
+
df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
|
78 |
else :
|
79 |
+
df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
|
80 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
81 |
|
82 |
+
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # Ko-Bench/openai, Ko-Bench/keval 행 합병
|
83 |
df_full_rs = df_full_rs.round(2)
|
84 |
df_full_rs.replace("", np.nan, inplace=True)
|
85 |
|
|
|
93 |
df_full_rs['License'] = '' # License 열 추가
|
94 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
95 |
|
96 |
+
df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
|
97 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
98 |
+
df_full_rs = df_full_rs.drop(columns=['Ko-Bench'])
|
99 |
|
100 |
plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์ํ models ๋ฆฌ์คํธ
|
101 |
|
|
|
230 |
#gradio
|
231 |
with gr.Blocks() as demo:
|
232 |
gr.Markdown("")
|
233 |
+
gr.Markdown("# ๐ Ko-Bench Leaderboard")
|
234 |
gr.Markdown("")
|
235 |
+
gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
|
236 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
237 |
+
gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
|
238 |
+
gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
|
239 |
gr.Markdown("")
|
240 |
+
gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
|
241 |
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
242 |
gr.Markdown("")
|
243 |
|
244 |
+
with gr.TabItem("Ko-Bench"):
|
245 |
gr.Dataframe(value=df_full_rs)
|
246 |
with gr.TabItem("Openai Judgment"):
|
247 |
gr.Dataframe(value=df_openai)
|