Spaces:
Sleeping
Sleeping
davidkim205
committed on
Commit
·
d05d8fb
1
Parent(s):
2d3d046
update ko_bench
Browse files- app.py +71 -49
- ko_bench.csv +4 -0
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
-
import plotly.express as px
|
5 |
import random
|
6 |
import plotly.graph_objects as go
|
7 |
|
@@ -62,7 +61,7 @@ def get_license(model): # ๋์๋ฌธ์ ๋ฌด์ํ๊ณ ๋ชจ๋ธ์ ๋งค์นญํ๊ธฐ ์
|
|
62 |
|
63 |
# dataframe_full
|
64 |
df_full_rs = df_rs.copy()
|
65 |
-
df_full_rs.rename(columns={'score': '
|
66 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
67 |
|
68 |
df_full_rs = df_full_rs.drop(columns=['turn']) # merge the turn 1 and turn 2 scores per model
|
@@ -70,16 +69,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
|
|
70 |
df_full_rs = df_full_rs.round(2)
|
71 |
df_full_rs.replace("", np.nan, inplace=True)
|
72 |
|
73 |
-
df_full_rs['
|
74 |
-
df_full_rs['
|
75 |
for idx, j_model in df_full_rs['judge_model'].items():
|
76 |
if j_model == 'keval':
|
77 |
-
df_full_rs.at[idx, '
|
78 |
else :
|
79 |
-
df_full_rs.at[idx, '
|
80 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
81 |
|
82 |
-
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() #
|
83 |
df_full_rs = df_full_rs.round(2)
|
84 |
df_full_rs.replace("", np.nan, inplace=True)
|
85 |
|
@@ -93,9 +92,9 @@ df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
|
|
93 |
df_full_rs['License'] = '' # add the License column
|
94 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
95 |
|
96 |
-
df_full_rs = df_full_rs.sort_values(by='
|
97 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
98 |
-
df_full_rs = df_full_rs.drop(columns=['
|
99 |
|
100 |
plot_models = df_full_rs['model'].unique() # list of models for the model detail view
|
101 |
|
@@ -135,7 +134,8 @@ df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
|
|
135 |
# model detail view
|
136 |
plot_models_list = plot_models.tolist()
|
137 |
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
|
138 |
-
|
|
|
139 |
random.seed(42)
|
140 |
|
141 |
def search_dataframe(query): # define the df search function
|
@@ -144,32 +144,36 @@ def search_dataframe(query): # df ๊ฒ์ ํจ์ ์ ์
|
|
144 |
filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
|
145 |
return filtered_df
|
146 |
|
147 |
-
def radar_chart(categories, Selected_model_turn1, Selected_model_turn2,
|
148 |
#categories = categories.split(',')
|
149 |
-
|
150 |
-
Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
|
151 |
-
Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
|
152 |
Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
|
153 |
Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
|
|
|
|
|
154 |
|
155 |
values_lists = [
|
156 |
-
list(map(float, Selected_model_turn1)),
|
157 |
-
list(map(float, Selected_model_turn2)),
|
158 |
list(map(float, Top1_turn1)),
|
159 |
-
list(map(float, Top1_turn2))
|
|
|
|
|
160 |
]
|
161 |
|
|
|
|
|
|
|
|
|
|
|
162 |
fig = go.Figure()
|
163 |
|
164 |
for i, values in enumerate(values_lists):
|
165 |
if len(categories) != len(values):
|
166 |
return f"Error in dataset {i+1}: Number of categories and values must be the same."
|
167 |
-
|
168 |
fig.add_trace(go.Scatterpolar(
|
169 |
r=values + [values[0]], # Closing the loop of the radar chart
|
170 |
theta=categories + [categories[0]], # Closing the loop of the radar chart
|
171 |
mode='lines',
|
172 |
-
name=category_labels[i] # Label for the dataset
|
|
|
173 |
))
|
174 |
|
175 |
fig.update_layout(
|
@@ -185,63 +189,82 @@ def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_tur
|
|
185 |
)
|
186 |
),
|
187 |
showlegend=True,
|
188 |
-
width=
|
189 |
-
height=
|
190 |
margin=dict(l=1000, r=20, t=20, b=20),
|
191 |
-
autosize = False,
|
192 |
paper_bgcolor='white',
|
193 |
-
plot_bgcolor='lightgrey'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
)
|
195 |
return fig
|
196 |
|
197 |
def search_openai_plot(dropdown_model): # define the openai plot function
|
198 |
-
condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] ==
|
199 |
-
|
|
|
|
|
|
|
200 |
|
201 |
-
|
202 |
-
|
203 |
|
204 |
-
|
205 |
-
|
206 |
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
209 |
|
210 |
-
fig = radar_chart(CATEGORIES, openai_turn1, openai_turn2,
|
211 |
return fig
|
212 |
|
213 |
def search_keval_plot(dropdown_model): # define the keval plot function
|
214 |
-
condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] ==
|
215 |
-
|
216 |
|
217 |
-
condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] ==
|
218 |
-
|
219 |
|
220 |
-
condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] ==
|
221 |
-
|
222 |
|
223 |
-
condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] ==
|
224 |
-
|
225 |
|
226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
return fig
|
228 |
|
229 |
|
230 |
#gradio
|
231 |
with gr.Blocks() as demo:
|
232 |
gr.Markdown("")
|
233 |
-
gr.Markdown("# ๐
|
|
|
234 |
gr.Markdown("")
|
235 |
-
gr.Markdown("#### The Ko-
|
236 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
237 |
-
gr.Markdown("-
|
238 |
-
gr.Markdown("-
|
|
|
239 |
gr.Markdown("")
|
240 |
-
gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
|
241 |
-
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
242 |
gr.Markdown("")
|
243 |
|
244 |
-
with gr.TabItem("
|
245 |
gr.Dataframe(value=df_full_rs)
|
246 |
with gr.TabItem("Openai Judgment"):
|
247 |
gr.Dataframe(value=df_openai)
|
@@ -257,7 +280,6 @@ with gr.Blocks() as demo:
|
|
257 |
with gr.Row():
|
258 |
plot_openai = gr.Plot(label="Openai Plot")
|
259 |
dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
|
260 |
-
#with gr.Row():
|
261 |
plot_keval = gr.Plot(label="Keval Plot")
|
262 |
dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
|
263 |
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
|
|
4 |
import random
|
5 |
import plotly.graph_objects as go
|
6 |
|
|
|
61 |
|
62 |
# dataframe_full
|
63 |
df_full_rs = df_rs.copy()
|
64 |
+
df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
|
65 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
66 |
|
67 |
df_full_rs = df_full_rs.drop(columns=['turn']) # merge the turn 1 and turn 2 scores per model
|
|
|
69 |
df_full_rs = df_full_rs.round(2)
|
70 |
df_full_rs.replace("", np.nan, inplace=True)
|
71 |
|
72 |
+
df_full_rs['KO-Bench/openai'] = '' # add the KO-Bench/openai and KO-Bench/keval columns
|
73 |
+
df_full_rs['KO-Bench/keval'] = ''
|
74 |
for idx, j_model in df_full_rs['judge_model'].items():
|
75 |
if j_model == 'keval':
|
76 |
+
df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
|
77 |
else :
|
78 |
+
df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
|
79 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
80 |
|
81 |
+
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # merge the KO-Bench/openai and KO-Bench/keval rows
|
82 |
df_full_rs = df_full_rs.round(2)
|
83 |
df_full_rs.replace("", np.nan, inplace=True)
|
84 |
|
|
|
92 |
df_full_rs['License'] = '' # add the License column
|
93 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
94 |
|
95 |
+
df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
|
96 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
97 |
+
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
|
98 |
|
99 |
plot_models = df_full_rs['model'].unique() # list of models for the model detail view
|
100 |
|
|
|
134 |
# model detail view
|
135 |
plot_models_list = plot_models.tolist()
|
136 |
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
|
137 |
+
colors_openai = ['#ff0000', '#ff1493', '#115e02', '#21ad05']
|
138 |
+
colors_keval = ['#ff0000', '#ff1493', '#0000ff', '#0592eb']
|
139 |
random.seed(42)
|
140 |
|
141 |
def search_dataframe(query): # define the df search function
|
|
|
144 |
filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
|
145 |
return filtered_df
|
146 |
|
147 |
+
def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Selected_model_turn2, category_labels, str): # function that draws the plot
|
148 |
#categories = categories.split(',')
|
|
|
|
|
|
|
149 |
Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
|
150 |
Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
|
151 |
+
Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
|
152 |
+
Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
|
153 |
|
154 |
values_lists = [
|
|
|
|
|
155 |
list(map(float, Top1_turn1)),
|
156 |
+
list(map(float, Top1_turn2)),
|
157 |
+
list(map(float, Selected_model_turn1)),
|
158 |
+
list(map(float, Selected_model_turn2))
|
159 |
]
|
160 |
|
161 |
+
if str == "openai": colors = colors_openai
|
162 |
+
else: colors = colors_keval
|
163 |
+
if str == "openai": title_text = "< Openai >"
|
164 |
+
else: title_text = "< Keval >"
|
165 |
+
|
166 |
fig = go.Figure()
|
167 |
|
168 |
for i, values in enumerate(values_lists):
|
169 |
if len(categories) != len(values):
|
170 |
return f"Error in dataset {i+1}: Number of categories and values must be the same."
|
|
|
171 |
fig.add_trace(go.Scatterpolar(
|
172 |
r=values + [values[0]], # Closing the loop of the radar chart
|
173 |
theta=categories + [categories[0]], # Closing the loop of the radar chart
|
174 |
mode='lines',
|
175 |
+
name=category_labels[i], # Label for the dataset
|
176 |
+
line = dict(color= colors[i])
|
177 |
))
|
178 |
|
179 |
fig.update_layout(
|
|
|
189 |
)
|
190 |
),
|
191 |
showlegend=True,
|
192 |
+
#width=650, # set an appropriate width
|
193 |
+
#height=650, # set an appropriate height
|
194 |
margin=dict(l=1000, r=20, t=20, b=20),
|
195 |
+
#autosize = False,
|
196 |
paper_bgcolor='white',
|
197 |
+
plot_bgcolor='lightgrey',
|
198 |
+
title=dict(
|
199 |
+
text=title_text, # change the title to the desired text
|
200 |
+
x=0.5, # x position of the title (0=left, 0.5=center, 1=right)
|
201 |
+
xanchor='center', # x anchor of the title (center, left, right)
|
202 |
+
y=0.95, # y position of the title (0=bottom, 1=top)
|
203 |
+
yanchor='top' # y anchor of the title (top, middle, bottom)
|
204 |
+
)
|
205 |
)
|
206 |
return fig
|
207 |
|
208 |
def search_openai_plot(dropdown_model):
    """Build the radar chart for the non-keval ("openai") judge.

    Compares the model named in the first row of ``df_openai`` (treated
    as the top-1 model) with the model chosen in the dropdown, for
    turn 1 and turn 2.

    Args:
        dropdown_model: Model name selected in the dropdown; matched
            against the ``model`` column of ``df``.

    Returns:
        Whatever ``radar_chart`` returns for the four score series.
    """
    # Look up the reference model once instead of once per use.
    top1_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
    # Every judge other than keval counts as the "openai" judge here.
    openai_rows = df['judge_model'] != 'keval'

    def category_scores(model, turn):
        rows = openai_rows & (df['turn'] == turn) & (df['model'] == model)
        # Select the columns by CATEGORIES so each value lines up with the
        # axis label radar_chart pairs it with.  A 'Coding':'Writing' label
        # slice yields DataFrame column order, which cannot equal the
        # CATEGORIES order (that list starts with "Writing").
        # NOTE(review): assumes df has one column per CATEGORIES entry.
        return df.loc[rows, CATEGORIES].values.tolist()

    top1_openai_turn1 = category_scores(top1_model, 1)
    top1_openai_turn2 = category_scores(top1_model, 2)
    openai_turn1 = category_scores(dropdown_model, 1)
    openai_turn2 = category_scores(dropdown_model, 2)

    # Legend entries, in the same order as the series passed to radar_chart.
    category_labels = [
        top1_model + " /Turn 1",
        top1_model + " /Turn 2",
        dropdown_model + " /Turn 1",
        dropdown_model + " /Turn 2",
    ]

    fig = radar_chart(CATEGORIES, top1_openai_turn1, top1_openai_turn2, openai_turn1, openai_turn2, category_labels, "openai")
    return fig
|
229 |
|
230 |
def search_keval_plot(dropdown_model):
    """Build the radar chart for the keval judge.

    Compares the model named in the first row of ``df_keval`` (treated
    as the top-1 model) with the model chosen in the dropdown, for
    turn 1 and turn 2.

    Args:
        dropdown_model: Model name selected in the dropdown; matched
            against the ``model`` column of ``df``.

    Returns:
        Whatever ``radar_chart`` returns for the four score series.
    """
    # Look up the reference model once instead of once per use.
    top1_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
    keval_rows = df['judge_model'] == 'keval'

    def category_scores(model, turn):
        rows = keval_rows & (df['turn'] == turn) & (df['model'] == model)
        # Select the columns by CATEGORIES so each value lines up with the
        # axis label radar_chart pairs it with.  A 'Coding':'Writing' label
        # slice yields DataFrame column order, which cannot equal the
        # CATEGORIES order (that list starts with "Writing").
        # NOTE(review): assumes df has one column per CATEGORIES entry.
        return df.loc[rows, CATEGORIES].values.tolist()

    top1_keval_turn1 = category_scores(top1_model, 1)
    top1_keval_turn2 = category_scores(top1_model, 2)
    keval_turn1 = category_scores(dropdown_model, 1)
    keval_turn2 = category_scores(dropdown_model, 2)

    # Legend entries, in the same order as the series passed to radar_chart.
    category_labels = [
        top1_model + " /Turn 1",
        top1_model + " /Turn 2",
        dropdown_model + " /Turn 1",
        dropdown_model + " /Turn 2",
    ]

    fig = radar_chart(CATEGORIES, top1_keval_turn1, top1_keval_turn2, keval_turn1, keval_turn2, category_labels, "keval")
    return fig
|
251 |
|
252 |
|
253 |
#gradio
|
254 |
with gr.Blocks() as demo:
|
255 |
gr.Markdown("")
|
256 |
+
gr.Markdown("# ๐ KO-Bench Leaderboard")
|
257 |
+
gr.Markdown("")
|
258 |
gr.Markdown("")
|
259 |
+
gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
|
260 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
261 |
+
gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
|
262 |
+
gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
|
263 |
+
gr.Markdown("")
|
264 |
gr.Markdown("")
|
|
|
|
|
265 |
gr.Markdown("")
|
266 |
|
267 |
+
with gr.TabItem("KO-Bench"):
|
268 |
gr.Dataframe(value=df_full_rs)
|
269 |
with gr.TabItem("Openai Judgment"):
|
270 |
gr.Dataframe(value=df_openai)
|
|
|
280 |
with gr.Row():
|
281 |
plot_openai = gr.Plot(label="Openai Plot")
|
282 |
dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
|
|
|
283 |
plot_keval = gr.Plot(label="Keval Plot")
|
284 |
dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
|
285 |
|
ko_bench.csv
CHANGED
@@ -5,6 +5,7 @@ gpt-4o,1,GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
|
|
5 |
gpt-4o,1,Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
|
6 |
gpt-4o,1,Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
|
7 |
gpt-4o,1,gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
|
|
|
8 |
gpt-4o,1,ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
|
9 |
gpt-4o,1,gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
|
10 |
gpt-4o,1,WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
|
@@ -24,6 +25,7 @@ gpt-4o,2,gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
|
|
24 |
gpt-4o,2,GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
|
25 |
gpt-4o,2,Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
|
26 |
gpt-4o,2,gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
|
|
|
27 |
gpt-4o,2,Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
|
28 |
gpt-4o,2,ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
|
29 |
gpt-4o,2,WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
|
@@ -43,6 +45,7 @@ keval,1,GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
|
|
43 |
keval,1,gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
|
44 |
keval,1,GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
|
45 |
keval,1,Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
|
|
|
46 |
keval,1,gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
|
47 |
keval,1,Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
|
48 |
keval,1,ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
|
@@ -66,6 +69,7 @@ keval,2,Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
|
|
66 |
keval,2,Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
|
67 |
keval,2,gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
|
68 |
keval,2,WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
|
|
|
69 |
keval,2,ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
|
70 |
keval,2,gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
|
71 |
keval,2,EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
|
|
|
5 |
gpt-4o,1,Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
|
6 |
gpt-4o,1,Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
|
7 |
gpt-4o,1,gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
|
8 |
+
gpt-4o,1,gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
|
9 |
gpt-4o,1,ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
|
10 |
gpt-4o,1,gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
|
11 |
gpt-4o,1,WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
|
|
|
25 |
gpt-4o,2,GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
|
26 |
gpt-4o,2,Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
|
27 |
gpt-4o,2,gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
|
28 |
+
gpt-4o,2,gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
|
29 |
gpt-4o,2,Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
|
30 |
gpt-4o,2,ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
|
31 |
gpt-4o,2,WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
|
|
|
45 |
keval,1,gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
|
46 |
keval,1,GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
|
47 |
keval,1,Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
|
48 |
+
keval,1,gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
|
49 |
keval,1,gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
|
50 |
keval,1,Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
|
51 |
keval,1,ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
|
|
|
69 |
keval,2,Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
|
70 |
keval,2,gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
|
71 |
keval,2,WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
|
72 |
+
keval,2,gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
|
73 |
keval,2,ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
|
74 |
keval,2,gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
|
75 |
keval,2,EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
|