davidkim205 committed on
Commit
2d3d046
·
1 Parent(s): 7172210
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -62,7 +62,7 @@ def get_license(model): # to match models case-insensitively
62
 
63
  # dataframe_full
64
  df_full_rs = df_rs.copy()
65
- df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
66
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
67
 
68
  df_full_rs = df_full_rs.drop(columns=['turn']) # merge each model's turn 1 and turn 2 scores
@@ -70,16 +70,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
70
  df_full_rs = df_full_rs.round(2)
71
  df_full_rs.replace("", np.nan, inplace=True)
72
 
73
- df_full_rs['KO-Bench/openai'] = '' # add the KO-Bench/openai and KO-Bench/keval columns
74
- df_full_rs['KO-Bench/keval'] = ''
75
  for idx, j_model in df_full_rs['judge_model'].items():
76
  if j_model == 'keval':
77
- df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
78
  else :
79
- df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
80
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
81
 
82
- df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # merge the KO-Bench/openai and KO-Bench/keval rows
83
  df_full_rs = df_full_rs.round(2)
84
  df_full_rs.replace("", np.nan, inplace=True)
85
 
@@ -93,9 +93,9 @@ df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
93
  df_full_rs['License'] = '' # add the License column
94
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
95
 
96
- df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
97
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
98
- df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
99
 
100
  plot_models = df_full_rs['model'].unique() # list of models for the model detail view
101
 
@@ -230,18 +230,18 @@ def search_keval_plot(dropdown_model): # keval plot function definition
230
  #gradio
231
  with gr.Blocks() as demo:
232
  gr.Markdown("")
233
- gr.Markdown("# 🏆 KO-Bench Leaderboard")
234
  gr.Markdown("")
235
- gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
236
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
237
- gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
238
- gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
239
  gr.Markdown("")
240
- gr.Markdown("github : https://github.com/davidkim205/ko-bench")
241
  gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
242
  gr.Markdown("")
243
 
244
- with gr.TabItem("KO-Bench"):
245
  gr.Dataframe(value=df_full_rs)
246
  with gr.TabItem("Openai Judgment"):
247
  gr.Dataframe(value=df_openai)
 
62
 
63
  # dataframe_full
64
  df_full_rs = df_rs.copy()
65
+ df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
66
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
67
 
68
  df_full_rs = df_full_rs.drop(columns=['turn']) # merge each model's turn 1 and turn 2 scores
 
70
  df_full_rs = df_full_rs.round(2)
71
  df_full_rs.replace("", np.nan, inplace=True)
72
 
73
+ df_full_rs['Ko-Bench/openai'] = '' # add the Ko-Bench/openai and Ko-Bench/keval columns
74
+ df_full_rs['Ko-Bench/keval'] = ''
75
  for idx, j_model in df_full_rs['judge_model'].items():
76
  if j_model == 'keval':
77
+ df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
78
  else :
79
+ df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
80
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
81
 
82
+ df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # merge the Ko-Bench/openai and Ko-Bench/keval rows
83
  df_full_rs = df_full_rs.round(2)
84
  df_full_rs.replace("", np.nan, inplace=True)
85
 
 
93
  df_full_rs['License'] = '' # add the License column
94
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
95
 
96
+ df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
97
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
98
+ df_full_rs = df_full_rs.drop(columns=['Ko-Bench'])
99
 
100
  plot_models = df_full_rs['model'].unique() # list of models for the model detail view
101
 
 
230
  #gradio
231
  with gr.Blocks() as demo:
232
  gr.Markdown("")
233
+ gr.Markdown("# 🏆 Ko-Bench Leaderboard")
234
  gr.Markdown("")
235
+ gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
236
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
237
+ gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
238
+ gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
239
  gr.Markdown("")
240
+ gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
241
  gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
242
  gr.Markdown("")
243
 
244
+ with gr.TabItem("Ko-Bench"):
245
  gr.Dataframe(value=df_full_rs)
246
  with gr.TabItem("Openai Judgment"):
247
  gr.Dataframe(value=df_openai)