lukecq committed
Commit c69a5b0
1 Parent(s): d31b48d
app.py CHANGED
@@ -34,13 +34,16 @@ def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
 all_columns = ['R', 'type', 'Model', 'open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
-show_columns = ['R', 'type', 'Model', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
+show_columns = ['R', 'Model', 'type', 'open?', 'params(B)', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 # Load the data from the csv file
 csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
-df_m3exam = df_m3exam.copy()[show_columns]
-df_mmlu = df_mmlu.copy()[show_columns]
+# df_m3exam = df_m3exam.copy()[show_columns]
+# df_mmlu = df_mmlu.copy()[show_columns]
 df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
+df_m3exam_init = df_m3exam.copy()[df_m3exam['type'] == '🔶 chat'][show_columns]
+df_mmlu_init = df_mmlu.copy()[df_mmlu['type'] == '🔶 chat'][show_columns]
 
 # data_types = ['number', 'str', 'markdown', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
 # map_columns = {'rank': 'R', 'type': 'type', 'Model': 'Model', 'open?': 'open?', 'avg_sea': 'avg_sea ⬇️', 'en': 'en', 'zh': 'zh', 'id': 'id', 'th': 'th', 'vi': 'vi', 'avg': 'avg', 'params': 'params(B)'}
@@ -143,10 +146,10 @@ with demo:
                 # + [AutoEvalColumn.dummy.name]
                 # ],
                 # headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                # datatype=TYPES,
+                datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
-                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
                 # datatype=[map_types[k] for k in shown_columns.value],
                 visible=True,
                 # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
@@ -194,17 +197,35 @@ with demo:
             )
         with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
             with gr.Row():
-                search_bar = gr.Textbox(
-                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                    show_label=False,
-                    elem_id="search-bar",
-                )
+                with gr.Column():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Column():
+                    type_query = gr.CheckboxGroup(
+                        choices=["🟢 base", "🔶 chat"],
+                        value=["🔶 chat"],
+                        label="model types to show",
+                        elem_id="type-select",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    open_query = gr.CheckboxGroup(
+                        choices=["open", "closed"],
+                        value=["open", "closed"],
+                        label="open-source or closed-source models?",
+                        elem_id="open-select",
+                        interactive=True,
+                    )
 
             leaderboard_table = gr.components.Dataframe(
-                value=df_m3exam,
+                value=df_m3exam_init,
                 interactive=False,
                 visible=True,
-                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                datatype=TYPES,
             )
 
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -216,31 +237,56 @@ with demo:
             search_bar.submit(
                 update_table,
                 [
-                    # df_avg,
-                    hidden_leaderboard_table_for_search,
-                    # shown_columns,
-                    # filter_columns_type,
-                    # filter_columns_precision,
-                    # filter_columns_size,
-                    # deleted_models_visibility,
+                    hidden_leaderboard_table_for_search,
+                    type_query,
+                    open_query,
                     search_bar,
                 ],
                 leaderboard_table,
             )
+            for selector in [type_query, open_query]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        type_query,
+                        open_query,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                )
 
         with gr.TabItem("MMLU", elem_id="llm-benchmark-MMLU", id=2):
             with gr.Row():
-                search_bar = gr.Textbox(
-                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                    show_label=False,
-                    elem_id="search-bar",
-                )
+                with gr.Column():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Column():
+                    type_query = gr.CheckboxGroup(
+                        choices=["🟢 base", "🔶 chat"],
+                        value=["🔶 chat"],
+                        label="model types to show",
+                        elem_id="type-select",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    open_query = gr.CheckboxGroup(
+                        choices=["open", "closed"],
+                        value=["open", "closed"],
+                        label="open-source or closed-source models?",
+                        elem_id="open-select",
+                        interactive=True,
+                    )
 
             leaderboard_table = gr.components.Dataframe(
-                value=df_mmlu,
+                value=df_mmlu_init,
                 interactive=False,
                 visible=True,
-                datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                datatype=TYPES,
            )
 
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -252,17 +298,24 @@ with demo:
             search_bar.submit(
                 update_table,
                 [
-                    # df_avg,
-                    hidden_leaderboard_table_for_search,
-                    # shown_columns,
-                    # filter_columns_type,
-                    # filter_columns_precision,
-                    # filter_columns_size,
-                    # deleted_models_visibility,
+                    hidden_leaderboard_table_for_search,
+                    type_query,
+                    open_query,
                     search_bar,
                 ],
                 leaderboard_table,
            )
+            for selector in [type_query, open_query]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        type_query,
+                        open_query,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
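Both tabs now wire the new `type_query` and `open_query` checkboxes, together with the hidden full table and the search box, into `update_table`, so the callback combines three filters: model type, open/closed status, and the `;`-separated name search. A minimal sketch of what such a callback could look like — a hypothetical illustration, not the Space's actual `update_table`, and the Y/N encoding of the `open?` column is an assumption based on the deleted results CSV below:

```python
import re

import pandas as pd


def update_table(df: pd.DataFrame, type_query: list, open_query: list, query: str) -> pd.DataFrame:
    # Hypothetical sketch, not the repo's actual implementation.
    # 1) Keep only the checked model types ("🟢 base" / "🔶 chat").
    filtered = df[df["type"].isin(type_query)]
    # 2) Map the "open"/"closed" checkboxes onto the Y/N flags in "open?"
    #    (assumed encoding, taken from the deleted SeaExam_results.csv).
    flags = [{"open": "Y", "closed": "N"}[q] for q in open_query]
    filtered = filtered[filtered["open?"].isin(flags)]
    # 3) Apply the search box last; `;` separates multiple model-name queries.
    terms = [t.strip() for t in query.split(";") if t.strip()]
    if terms:
        pattern = "|".join(re.escape(t) for t in terms)
        filtered = filtered[filtered["Model"].str.contains(pattern, case=False)]
    return filtered
```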
eval-results/.gitattributes DELETED
@@ -1,55 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.lz4 filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-# Audio files - uncompressed
-*.pcm filter=lfs diff=lfs merge=lfs -text
-*.sam filter=lfs diff=lfs merge=lfs -text
-*.raw filter=lfs diff=lfs merge=lfs -text
-# Audio files - compressed
-*.aac filter=lfs diff=lfs merge=lfs -text
-*.flac filter=lfs diff=lfs merge=lfs -text
-*.mp3 filter=lfs diff=lfs merge=lfs -text
-*.ogg filter=lfs diff=lfs merge=lfs -text
-*.wav filter=lfs diff=lfs merge=lfs -text
-# Image files - uncompressed
-*.bmp filter=lfs diff=lfs merge=lfs -text
-*.gif filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.tiff filter=lfs diff=lfs merge=lfs -text
-# Image files - compressed
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.jpeg filter=lfs diff=lfs merge=lfs -text
-*.webp filter=lfs diff=lfs merge=lfs -text
eval-results/README.md DELETED
@@ -1,24 +0,0 @@
----
-license: apache-2.0
-language:
-- en
-- zh
-- vi
-- id
-- th
-
-size_categories:
-- n<1K
-configs:
-- config_name: results
-  data_files: SeaExam_results.csv
----
-
-# About
-
-This repo contains the original results for the space [SeaExam Leaderboard](https://huggingface.co/spaces/SeaLLMs/SeaExam_leaderboard).
-
-To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
-```python
-python scripts/main.py --model $model_name_or_path
-```
eval-results/SeaExam_results.csv DELETED
@@ -1,47 +0,0 @@
-,,,,M3Exam,,,,,,,MMLU,,,,,,,AVG,,,,,,
-Model,type,open?,shot,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea
-gpt-4-turbo-1106,chat,N,0,0.87683,0.78882,0.64873,0.68956,0.70774,0.74234,0.68201,0.79825,0.72912,0.74526,0.67088,0.71053,0.73081,0.70889,0.83754,0.75897,0.69700,0.68022,0.70913,0.73657,0.69545
-Meta-Llama-3-70B,base,Y,3,0.84382,0.75621,0.61899,0.66181,0.68252,0.71267,0.65444,0.78772,0.70491,0.73509,0.65930,0.70526,0.71846,0.69988,0.81577,0.73056,0.67704,0.66055,0.69389,0.71556,0.67716
-Meta-Llama-3-70B-Instruct,chat,Y,3,0.86321,0.69410,0.62975,0.64299,0.68424,0.70286,0.65233,0.79965,0.69088,0.72316,0.63228,0.68912,0.70702,0.68152,0.83143,0.69249,0.67645,0.63764,0.68668,0.70494,0.66692
-Qwen1.5-72B,base,Y,3,0.83857,0.92547,0.58734,0.56820,0.64756,0.71343,0.60104,0.74491,0.69474,0.66456,0.56351,0.63088,0.65972,0.61965,0.79174,0.81010,0.62595,0.56586,0.63922,0.68657,0.61034
-claude-3-sonnet-20240229,chat,N,0,0.78878,0.68323,0.58544,0.57150,0.62579,0.65095,0.59424,0.71333,0.60456,0.63684,0.54070,0.58421,0.61593,0.58725,0.75106,0.64390,0.61114,0.55610,0.60500,0.63344,0.59075
-claude-3-haiku-20240307,chat,N,0,0.79036,0.65217,0.56266,0.57291,0.63095,0.64181,0.58884,0.71053,0.60526,0.61193,0.51474,0.56316,0.60112,0.56327,0.75044,0.62872,0.58729,0.54382,0.59705,0.62147,0.57606
-dbrx-base,base,Y,3,0.80818,0.68944,0.53418,0.50659,0.60458,0.62859,0.54845,0.73123,0.64281,0.64456,0.47368,0.61754,0.62196,0.57860,0.76970,0.66612,0.58937,0.49013,0.61106,0.62528,0.56352
-Mixtral-8x22B-v0.1,base,Y,3,0.83910,0.69565,0.56962,0.48730,0.60115,0.63856,0.55269,0.76877,0.62491,0.64667,0.45018,0.57649,0.61340,0.55778,0.80394,0.66028,0.60814,0.46874,0.58882,0.62598,0.55523
-SeaLLM-7B-v2.5,chat,Y,3,0.75943,0.60248,0.50063,0.50659,0.61834,0.59749,0.54185,0.64877,0.53719,0.56772,0.48667,0.53018,0.55411,0.52819,0.70410,0.56984,0.53418,0.49663,0.57426,0.57580,0.53502
-Qwen1.5-14B,base,Y,3,0.79665,0.86180,0.52722,0.47836,0.54900,0.64260,0.51819,0.67509,0.60211,0.55719,0.44491,0.52351,0.56056,0.50854,0.73587,0.73195,0.54220,0.46164,0.53625,0.60158,0.51336
-gemini-1.0-pro,chat,N,0,0.56866,0.72516,0.43987,0.49247,0.60516,0.56626,0.51250,0.54912,0.59684,0.53368,0.43895,0.55298,0.53432,0.50854,0.55889,0.66100,0.48678,0.46571,0.57907,0.55029,0.51052
-gemma-7b,base,Y,3,0.73061,0.52795,0.46456,0.46284,0.59656,0.55650,0.50799,0.63579,0.50772,0.55228,0.48842,0.49684,0.53621,0.51251,0.68320,0.51783,0.50842,0.47563,0.54670,0.54636,0.51025
-gpt-3.5-turbo-0125,,N,3,0.75105,0.58851,0.50000,0.38852,0.53352,0.55232,0.47402,0.68211,0.54912,0.59088,0.38596,0.50246,0.54211,0.49310,0.71658,0.56882,0.54544,0.38724,0.51799,0.54721,0.48356
-Mixtral-8x7B-v0.1,base,Y,3,0.77096,0.60559,0.47975,0.43509,0.52206,0.56269,0.47897,0.70351,0.54140,0.56632,0.39298,0.49404,0.53965,0.48444,0.73724,0.57350,0.52303,0.41404,0.50805,0.55117,0.48171
-Llama-2-70b-hf,base,Y,3,0.74895,0.59938,0.49177,0.34478,0.55931,0.54884,0.46529,0.68526,0.55965,0.58982,0.32737,0.52035,0.53649,0.47918,0.71711,0.57951,0.54080,0.33607,0.53983,0.54267,0.47223
-Meta-Llama-3-8B,base,Y,3,0.70021,0.54037,0.42722,0.45390,0.50888,0.52612,0.46333,0.63193,0.48561,0.51158,0.43579,0.49053,0.51109,0.47930,0.66607,0.51299,0.46940,0.44485,0.49970,0.51860,0.47132
-Sailor-7B-Chat,chat,Y,3,0.65618,0.65062,0.47405,0.46425,0.51175,0.55137,0.48335,0.55579,0.47509,0.48526,0.41789,0.46105,0.47902,0.45474,0.60599,0.56285,0.47966,0.44107,0.48640,0.51519,0.46904
-gpt-3.5-turbo-0125,chat,N,0,0.75577,0.60559,0.49304,0.39652,0.52894,0.55597,0.47283,0.67228,0.53018,0.56667,0.36070,0.46281,0.51853,0.46339,0.71402,0.56788,0.52985,0.37861,0.49587,0.53725,0.46811
-Yi-34B,base,Y,3,0.81499,0.86025,0.54114,0.38147,0.50201,0.61997,0.47487,0.75860,0.68386,0.60105,0.31439,0.45018,0.56161,0.45520,0.78679,0.77205,0.57110,0.34793,0.47609,0.59079,0.46504
-Meta-Llama-3-8B-Instruct,chat,Y,3,0.72537,0.53727,0.46646,0.37065,0.50946,0.52184,0.44885,0.64912,0.48246,0.50421,0.36702,0.47544,0.49565,0.44889,0.68724,0.50986,0.48533,0.36883,0.49245,0.50874,0.44887
-SeaLLM-7B-v2,chat,Y,3,0.70178,0.51553,0.43165,0.40593,0.51519,0.51401,0.45092,0.61474,0.45930,0.49158,0.36246,0.44246,0.47411,0.43216,0.65826,0.48741,0.46161,0.38419,0.47882,0.49406,0.44154
-Sailor-7B,base,Y,3,0.61111,0.63199,0.44304,0.40969,0.49914,0.51899,0.45062,0.52456,0.44737,0.45614,0.40070,0.43754,0.45326,0.43146,0.56784,0.53968,0.44959,0.40520,0.46834,0.48613,0.44104
-Qwen1.5-7B-Chat,chat,Y,3,0.64570,0.62733,0.43038,0.39793,0.49226,0.51872,0.44019,0.58351,0.51579,0.42772,0.36316,0.44667,0.46737,0.41251,0.61461,0.57156,0.42905,0.38054,0.46947,0.49304,0.42635
-Yi-9B,base,Y,3,0.77516,0.79193,0.49241,0.35748,0.45330,0.57405,0.43439,0.67684,0.59263,0.50772,0.29404,0.38140,0.49053,0.39439,0.72600,0.69228,0.50006,0.32576,0.41735,0.53229,0.41439
-Qwen1.5-7B,base,Y,3,0.72117,0.81056,0.44114,0.36124,0.44986,0.55679,0.41741,0.61228,0.51509,0.45895,0.34105,0.41333,0.46814,0.40444,0.66673,0.66282,0.45004,0.35115,0.43159,0.51247,0.41093
-Mistral-7B-v0.1,base,Y,3,0.67715,0.49689,0.42152,0.34572,0.40860,0.46998,0.39194,0.60877,0.45754,0.47053,0.31579,0.40351,0.45123,0.39661,0.64296,0.47722,0.44602,0.33075,0.40605,0.46060,0.39428
-gemma-7b-it,chat,Y,3,0.62159,0.42702,0.37342,0.32079,0.46705,0.44197,0.38709,0.52421,0.42632,0.41719,0.34456,0.39298,0.42105,0.38491,0.57290,0.42667,0.39531,0.33268,0.43002,0.43151,0.38600
-Mistral-7B-Instruct-v0.2,chat,Y,3,0.65671,0.49534,0.40443,0.30386,0.39885,0.45184,0.36905,0.58877,0.43404,0.44246,0.32596,0.38211,0.43467,0.38351,0.62274,0.46469,0.42344,0.31491,0.39048,0.44325,0.37628
-Qwen1.5-4B,base,Y,3,0.66352,0.77174,0.35127,0.31891,0.38854,0.49879,0.35290,0.55018,0.46807,0.39298,0.31193,0.36947,0.41853,0.35813,0.60685,0.61990,0.37212,0.31542,0.37901,0.45866,0.35552
-Yi-6B,base,Y,3,0.70440,0.80901,0.41076,0.29821,0.37020,0.51852,0.35972,0.62175,0.54316,0.43825,0.26140,0.33368,0.43965,0.34444,0.66308,0.67608,0.42450,0.27981,0.35194,0.47908,0.35208
-Llama-2-13b-hf,base,Y,3,0.60535,0.36491,0.38418,0.28786,0.40860,0.41018,0.36021,0.53368,0.38877,0.42421,0.24175,0.36386,0.39046,0.34327,0.56952,0.37684,0.40419,0.26481,0.38623,0.40032,0.35174
-Llama-2-13b-chat-hf,chat,Y,3,0.58910,0.38199,0.37152,0.28833,0.38968,0.40412,0.34985,0.53088,0.38281,0.40351,0.25789,0.34561,0.38414,0.33567,0.55999,0.38240,0.38751,0.27311,0.36765,0.39413,0.34276
-Qwen1.5-MoE-A2.7B,base,Y,3,0.62788,0.78882,0.36582,0.25400,0.40172,0.48765,0.34051,0.56456,0.49123,0.40772,0.26070,0.31684,0.40821,0.32842,0.59622,0.64002,0.38677,0.25735,0.35928,0.44793,0.33447
-gemma-2b-it,chat,Y,3,0.43868,0.37733,0.31646,0.28363,0.35702,0.35462,0.31904,0.37789,0.33614,0.33930,0.30526,0.32035,0.33579,0.32164,0.40829,0.35673,0.32788,0.29445,0.33869,0.34521,0.32034
-Llama-2-7b-chat-hf,chat,Y,3,0.56604,0.32609,0.34114,0.26811,0.34040,0.36835,0.31655,0.48211,0.35509,0.35789,0.25684,0.33298,0.35698,0.31591,0.52407,0.34059,0.34952,0.26248,0.33669,0.36267,0.31623
-bloomz-7b1,chat,Y,3,0.43082,0.37733,0.36139,0.25588,0.35645,0.35637,0.32457,0.36561,0.34386,0.32526,0.22386,0.31684,0.31509,0.28865,0.39822,0.36059,0.34333,0.23987,0.33664,0.33573,0.30661
-gemma-2b,base,Y,3,0.41719,0.27484,0.30443,0.28645,0.31576,0.31974,0.30221,0.37860,0.30281,0.31474,0.30070,0.30667,0.32070,0.30737,0.39789,0.28883,0.30958,0.29358,0.31121,0.32022,0.30479
-Llama-2-7b-hf,base,Y,3,0.49109,0.32298,0.30823,0.26341,0.31748,0.34064,0.29637,0.44982,0.33439,0.34421,0.25930,0.30877,0.33930,0.30409,0.47046,0.32868,0.32622,0.26135,0.31313,0.33997,0.30023
-Qwen1.5-1.8B,base,Y,3,0.54612,0.71273,0.32595,0.24365,0.32378,0.43045,0.29779,0.46211,0.39018,0.32702,0.24456,0.32281,0.34933,0.29813,0.50411,0.55145,0.32648,0.24411,0.32329,0.38989,0.29796
-Qwen1.5-0.5B,base,Y,3,0.44602,0.61025,0.29367,0.26011,0.29742,0.38149,0.28373,0.38737,0.32421,0.29649,0.28456,0.29965,0.31846,0.29357,0.41669,0.46723,0.29508,0.27234,0.29854,0.34998,0.28865
-sea-lion-7b-instruct,chat,Y,3,0.26992,0.27329,0.28671,0.26435,0.26877,0.27261,0.27327,0.26947,0.26070,0.25684,0.26526,0.25474,0.26140,0.25895,0.26969,0.26700,0.27178,0.26480,0.26175,0.26700,0.26611
-sea-lion-7b,base,Y,3,0.24476,0.22826,0.25443,0.26435,0.24126,0.24661,0.25335,0.24772,0.26175,0.24982,0.24491,0.26351,0.25354,0.25275,0.24624,0.24501,0.25213,0.25463,0.25238,0.25008,0.25305
-phi-2,base,Y,3,0.58176,0.28571,0.29494,0.20978,0.26934,0.32831,0.25802,0.56842,0.29439,0.29333,0.14105,0.26842,0.31312,0.23427,0.57509,0.29005,0.29414,0.17542,0.26888,0.32072,0.24614
-bloom-7b1,base,Y,3,0.22694,0.18323,0.25316,0.24036,0.24298,0.22933,0.24550,0.25088,0.23895,0.25158,0.23684,0.24456,0.24456,0.24433,0.23891,0.21109,0.25237,0.23860,0.24377,0.23695,0.24491
-claude-3-opus-20240229,chat,N,0,,,0.70316,0.73330,0.74613,,0.72753,,,,,,,,,,,,,,
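The deleted results file uses a two-row header: the first row groups columns by benchmark (M3Exam / MMLU / AVG) and the second names the per-model columns (Model, type, open?, shot, en, zh, id, th, vi, avg, avg_sea). app.py unpacks it as `df_m3exam, df_mmlu, df_avg = load_data(csv_path)`; a hypothetical sketch of a loader for this layout (the repo's real `load_data` may also rank models and map the type/open? flags to the emoji labels used by the UI):

```python
import pandas as pd


def load_data(csv_path: str):
    # Hypothetical sketch of a loader for the two-row-header results CSV.
    # Level 0 of the column MultiIndex is the benchmark group
    # (M3Exam / MMLU / AVG); level 1 is the column name (en, zh, avg_sea, ...).
    raw = pd.read_csv(csv_path, header=[0, 1])

    # Columns whose group header is empty come back as "Unnamed: ..."; these
    # are the shared metadata columns (Model, type, open?, shot).
    is_meta = raw.columns.get_level_values(0).str.startswith("Unnamed")
    meta = raw.loc[:, is_meta].copy()
    meta.columns = meta.columns.get_level_values(1)

    # Rebuild one flat DataFrame per benchmark group.
    m3exam, mmlu, avg = (
        pd.concat([meta, raw[g]], axis=1) for g in ("M3Exam", "MMLU", "AVG")
    )
    return m3exam, mmlu, avg
```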
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json DELETED
@@ -1,15 +0,0 @@
-{
-    "config": {
-        "model_dtype": "torch.float16",
-        "model_name": "demo-leaderboard/gpt2-demo",
-        "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
-    },
-    "results": {
-        "task_name1": {
-            "metric_name": 0
-        },
-        "task_name2": {
-            "metric_name": 0.90
-        }
-    }
-}
src/display/about.py CHANGED
@@ -22,13 +22,14 @@ TITLE = """<h1 align="center" id="space-title">📃 SeaExam Leaderboard</h1>"""
 SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h1>"""
 
 # What does your leaderboard evaluate?
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
+# """
+
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
+This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
 """
 
-# INTRODUCTION_TEXT = """
-# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
-
 # For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
 
 # Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
update_git.sh DELETED
@@ -1,3 +0,0 @@
-git add .
-git commit -m "update scripts"
-git push