binwang committed on
Commit
4687701
1 Parent(s): e90e78a

new format

Browse files
Files changed (1) hide show
  1. app.py +394 -583
app.py CHANGED
@@ -55,12 +55,10 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
55
  df_list = []
56
 
57
  for model in MODEL_LIST:
58
-
59
-
60
- results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
61
-
62
 
63
  try:
 
 
64
  overall_acc = [results['overall_acc'] for results in results_list]
65
  overall_acc = median(overall_acc)
66
 
@@ -70,20 +68,18 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
70
  AC3_3 = [results['AC3_3'] for results in results_list]
71
  AC3_3 = median(AC3_3)
72
 
73
- except:
74
- consistency_score_3 = -1
75
- overall_acc = -1
76
- AC3_3 = -1
 
 
 
77
 
78
- res = {
79
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
80
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
81
- "Accuracy": overall_acc,
82
- "Cross-Lingual Consistency": consistency_score_3,
83
- "AC3": AC3_3,
84
- }
85
 
86
- df_list.append(res)
 
87
 
88
 
89
  df = pd.DataFrame(df_list)
@@ -104,7 +100,6 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
104
 
105
  return df
106
 
107
-
108
  CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
109
  CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
110
 
@@ -114,12 +109,10 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
114
  df_list = []
115
 
116
  for model in MODEL_LIST:
117
-
118
-
119
- results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
120
-
121
-
122
  try:
 
 
123
  English = [results['language_acc']['English'] for results in results_list]
124
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
125
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
@@ -130,23 +123,19 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
130
  Chinese = median(Chinese)
131
  Spanish = median(Spanish)
132
 
 
 
 
 
 
 
 
 
133
 
134
- except:
135
- English = -1
136
- Vietnamese = -1
137
- Chinese = -1
138
- Spanish = -1
139
-
140
- res = {
141
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
142
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
143
- "English": English,
144
- "Vietnamese": Vietnamese,
145
- "Chinese": Chinese,
146
- "Spanish": Spanish,
147
- }
148
 
149
- df_list.append(res)
 
150
 
151
 
152
  df = pd.DataFrame(df_list)
@@ -167,7 +156,6 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
167
 
168
  return df
169
 
170
-
171
  CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
172
  CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
173
 
@@ -186,12 +174,11 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
186
  df_list = []
187
 
188
  for model in MODEL_LIST:
189
-
190
-
191
- results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
192
-
193
 
194
  try:
 
 
 
195
  overall_acc = [results['overall_acc'] for results in results_list]
196
  overall_acc = median(overall_acc)
197
 
@@ -201,20 +188,17 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
201
  AC3_3 = [results['AC3_3'] for results in results_list]
202
  AC3_3 = median(AC3_3)
203
 
204
- except:
205
- consistency_score_3 = -1
206
- overall_acc = -1
207
- AC3_3 = -1
208
-
209
- res = {
210
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
211
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
212
- "Accuracy": overall_acc,
213
- "Cross-Lingual Consistency": consistency_score_3,
214
- "AC3": AC3_3,
215
- }
216
 
217
- df_list.append(res)
 
218
 
219
 
220
  df = pd.DataFrame(df_list)
@@ -235,7 +219,6 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
235
 
236
  return df
237
 
238
-
239
  CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
240
  CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
241
 
@@ -245,12 +228,11 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
245
  df_list = []
246
 
247
  for model in MODEL_LIST:
 
 
248
 
 
249
 
250
- results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
251
-
252
-
253
- try:
254
  English = [results['language_acc']['English'] for results in results_list]
255
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
256
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
@@ -267,30 +249,22 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
267
  Spanish = median(Spanish)
268
  Malay = median(Malay)
269
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
- except:
272
- English = -1
273
- Vietnamese = -1
274
- Chinese = -1
275
- Indonesian = -1
276
- Filipino = -1
277
- Spanish = -1
278
- Malay = -1
279
-
280
- res = {
281
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
282
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
283
- "English": English,
284
- "Vietnamese": Vietnamese,
285
- "Chinese": Chinese,
286
- "Indonesian": Indonesian,
287
- "Filipino": Filipino,
288
- "Spanish": Spanish,
289
- "Malay": Malay,
290
- }
291
-
292
- df_list.append(res)
293
 
 
 
294
 
295
  df = pd.DataFrame(df_list)
296
  # If there are any models that are the same, merge them
@@ -310,7 +284,6 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
310
 
311
  return df
312
 
313
-
314
  CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
315
  CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
316
 
@@ -325,12 +298,11 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
325
  df_list = []
326
 
327
  for model in MODEL_LIST:
 
 
328
 
 
329
 
330
- results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
331
-
332
-
333
- try:
334
  overall_acc = [results['overall_acc'] for results in results_list]
335
  overall_acc = median(overall_acc)
336
 
@@ -340,20 +312,18 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
340
  AC3_3 = [results['AC3_3'] for results in results_list]
341
  AC3_3 = median(AC3_3)
342
 
343
- except:
344
- consistency_score_3 = -1
345
- overall_acc = -1
346
- AC3_3 = -1
 
 
 
347
 
348
- res = {
349
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
350
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
351
- "Accuracy": overall_acc,
352
- "Cross-Lingual Consistency": consistency_score_3,
353
- "AC3": AC3_3,
354
- }
355
 
356
- df_list.append(res)
 
357
 
358
 
359
  df = pd.DataFrame(df_list)
@@ -384,12 +354,11 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
384
  df_list = []
385
 
386
  for model in MODEL_LIST:
 
 
387
 
 
388
 
389
- results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
390
-
391
-
392
- try:
393
  English = [results['language_acc']['English'] for results in results_list]
394
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
395
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
@@ -406,30 +375,24 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
406
  Spanish = median(Spanish)
407
  Malay = median(Malay)
408
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
- except:
411
- English = -1
412
- Vietnamese = -1
413
- Chinese = -1
414
- Indonesian = -1
415
- Filipino = -1
416
- Spanish = -1
417
- Malay = -1
418
-
419
- res = {
420
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
421
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
422
- "English": English,
423
- "Vietnamese": Vietnamese,
424
- "Chinese": Chinese,
425
- "Indonesian": Indonesian,
426
- "Filipino": Filipino,
427
- "Spanish": Spanish,
428
- "Malay": Malay,
429
- }
430
 
431
- df_list.append(res)
 
432
 
 
433
 
434
  df = pd.DataFrame(df_list)
435
  # If there are any models that are the same, merge them
@@ -462,24 +425,23 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
462
  df_list = []
463
 
464
  for model in MODEL_LIST:
465
-
466
-
467
- results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
468
-
469
 
470
  try:
 
 
471
  accuracy = median([results['accuracy'] for results in results_list])
472
 
473
- except:
474
- accuracy = -1
 
 
 
475
 
476
- res = {
477
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
478
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
479
- "Accuracy": accuracy,
480
- }
481
 
482
- df_list.append(res)
483
 
484
 
485
  df = pd.DataFrame(df_list)
@@ -515,24 +477,20 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
515
 
516
  for model in MODEL_LIST:
517
 
518
-
519
- results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
520
-
521
-
522
  try:
 
523
  accuracy = median([results['accuracy'] for results in results_list])
524
 
525
- except:
526
- accuracy = -1
527
-
 
 
528
 
529
- res = {
530
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
531
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
532
- "Accuracy": accuracy,
533
- }
534
 
535
- df_list.append(res)
 
536
 
537
 
538
  df = pd.DataFrame(df_list)
@@ -567,26 +525,21 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
567
  df_list = []
568
 
569
  for model in MODEL_LIST:
570
-
571
-
572
- results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
573
-
574
 
575
  try:
 
576
  accuracy = median([results['accuracy'] for results in results_list])
577
 
578
- except:
579
- accuracy = -1
580
-
 
 
581
 
582
- res = {
583
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
584
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
585
- "Accuracy": accuracy,
586
- }
587
-
588
- df_list.append(res)
589
 
 
 
590
 
591
  df = pd.DataFrame(df_list)
592
  # If there are any models that are the same, merge them
@@ -606,7 +559,6 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
606
 
607
  return df
608
 
609
-
610
  CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
611
  CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
612
 
@@ -614,7 +566,6 @@ CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
614
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
615
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
616
 
617
-
618
  def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
619
 
620
  df_list = []
@@ -622,23 +573,21 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
622
  for model in MODEL_LIST:
623
 
624
 
625
- results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
626
 
627
 
628
  try:
 
629
  accuracy = median([results['accuracy'] for results in results_list])
 
 
 
 
 
630
 
631
- except:
632
- accuracy = -1
633
-
634
 
635
- res = {
636
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
637
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
638
- "Accuracy": accuracy,
639
- }
640
-
641
- df_list.append(res)
642
 
643
 
644
  df = pd.DataFrame(df_list)
@@ -673,25 +622,21 @@ def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
673
  df_list = []
674
 
675
  for model in MODEL_LIST:
676
-
677
-
678
- results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
679
-
680
 
681
  try:
 
682
  bleu_score = median([results['bleu_score'] for results in results_list])
683
 
684
- except:
685
- bleu_score = -1
686
-
 
 
687
 
688
- res = {
689
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
690
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
691
- "BLEU": bleu_score,
692
- }
693
 
694
- df_list.append(res)
 
695
 
696
 
697
  df = pd.DataFrame(df_list)
@@ -725,25 +670,21 @@ def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
725
  df_list = []
726
 
727
  for model in MODEL_LIST:
728
-
729
-
730
- results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
731
-
732
 
733
  try:
 
734
  bleu_score = median([results['bleu_score'] for results in results_list])
735
 
736
- except:
737
- bleu_score = -1
738
-
 
 
739
 
740
- res = {
741
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
742
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
743
- "BLEU": bleu_score,
744
- }
745
 
746
- df_list.append(res)
 
747
 
748
 
749
  df = pd.DataFrame(df_list)
@@ -779,26 +720,21 @@ def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
779
  df_list = []
780
 
781
  for model in MODEL_LIST:
782
-
783
-
784
- results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
785
-
786
 
787
  try:
 
788
  bleu_score = median([results['bleu_score'] for results in results_list])
789
 
790
- except:
791
- bleu_score = -1
792
-
793
-
794
- res = {
795
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
796
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
797
- "BLEU": bleu_score,
798
- }
799
 
800
- df_list.append(res)
801
 
 
 
802
 
803
  df = pd.DataFrame(df_list)
804
  # If there are any models that are the same, merge them
@@ -831,26 +767,21 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
831
  df_list = []
832
 
833
  for model in MODEL_LIST:
834
-
835
-
836
- results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
837
-
838
 
839
  try:
 
840
  bleu_score = median([results['bleu_score'] for results in results_list])
841
 
842
- except:
843
- bleu_score = -1
844
-
845
-
846
- res = {
847
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
848
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
849
- "BLEU": bleu_score,
850
- }
851
 
852
- df_list.append(res)
853
 
 
 
854
 
855
  df = pd.DataFrame(df_list)
856
  # If there are any models that are the same, merge them
@@ -870,7 +801,6 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
870
 
871
  return df
872
 
873
-
874
  FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
875
  FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
876
 
@@ -884,26 +814,20 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
884
  df_list = []
885
 
886
  for model in MODEL_LIST:
887
-
888
-
889
- results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
890
-
891
-
892
  try:
 
893
  bleu_score = median([results['bleu_score'] for results in results_list])
894
 
895
- except:
896
- bleu_score = -1
897
-
898
-
899
- res = {
900
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
901
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
902
- "BLEU": bleu_score,
903
- }
904
-
905
- df_list.append(res)
906
 
 
 
907
 
908
  df = pd.DataFrame(df_list)
909
  # If there are any models that are the same, merge them
@@ -923,7 +847,6 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
923
 
924
  return df
925
 
926
-
927
  FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
928
  FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
929
 
@@ -937,27 +860,21 @@ def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
937
  df_list = []
938
 
939
  for model in MODEL_LIST:
940
-
941
-
942
- results_list = [ALL_RESULTS[model][eval_mode]['mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu']]
943
-
944
-
945
  try:
 
946
  accuracy = median([results['accuracy'] for results in results_list])
947
 
 
 
 
 
 
 
 
948
  except:
949
  accuracy = -1
950
 
951
-
952
- res = {
953
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
954
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
955
- "Accuracy": accuracy,
956
- }
957
-
958
- df_list.append(res)
959
-
960
-
961
  df = pd.DataFrame(df_list)
962
  # If there are any models that are the same, merge them
963
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
@@ -984,32 +901,26 @@ MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")
984
 
985
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
986
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
987
-
988
-
989
  def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
990
 
991
  df_list = []
992
 
993
  for model in MODEL_LIST:
994
-
995
-
996
- results_list = [ALL_RESULTS[model][eval_mode]['mmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu_full']]
997
-
998
-
999
  try:
 
1000
  accuracy = median([results['accuracy'] for results in results_list])
1001
 
1002
- except:
1003
- accuracy = -1
1004
-
 
 
1005
 
1006
- res = {
1007
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1008
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1009
- "Accuracy": accuracy,
1010
- }
1011
 
1012
- df_list.append(res)
 
1013
 
1014
 
1015
  df = pd.DataFrame(df_list)
@@ -1030,40 +941,31 @@ def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
1030
 
1031
  return df
1032
 
1033
-
1034
  MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
1035
  MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
1036
 
1037
 
1038
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1039
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1040
-
1041
-
1042
  def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
1043
 
1044
  df_list = []
1045
 
1046
- for model in MODEL_LIST:
1047
-
1048
-
1049
- results_list = [ALL_RESULTS[model][eval_mode]['c_eval'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval']]
1050
-
1051
-
1052
  try:
 
1053
  accuracy = median([results['accuracy'] for results in results_list])
1054
 
1055
- except:
1056
- accuracy = -1
1057
-
1058
-
1059
- res = {
1060
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1061
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1062
- "Accuracy": accuracy,
1063
- }
1064
 
1065
- df_list.append(res)
1066
 
 
 
1067
 
1068
  df = pd.DataFrame(df_list)
1069
  # If there are any models that are the same, merge them
@@ -1083,7 +985,6 @@ def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
1083
 
1084
  return df
1085
 
1086
-
1087
  C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
1088
  C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
1089
 
@@ -1097,25 +998,23 @@ def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
1097
  df_list = []
1098
 
1099
  for model in MODEL_LIST:
1100
-
1101
-
1102
- results_list = [ALL_RESULTS[model][eval_mode]['c_eval_full'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval_full']]
1103
-
1104
-
1105
  try:
 
1106
  accuracy = median([results['accuracy'] for results in results_list])
1107
 
1108
- except:
1109
- accuracy = -1
 
 
 
1110
 
 
 
 
 
1111
 
1112
- res = {
1113
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1114
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1115
- "Accuracy": accuracy,
1116
- }
1117
 
1118
- df_list.append(res)
1119
 
1120
 
1121
  df = pd.DataFrame(df_list)
@@ -1152,25 +1051,24 @@ def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
1152
  df_list = []
1153
 
1154
  for model in MODEL_LIST:
1155
-
1156
-
1157
- results_list = [ALL_RESULTS[model][eval_mode]['cmmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu']]
1158
-
1159
-
1160
  try:
 
1161
  accuracy = median([results['accuracy'] for results in results_list])
1162
 
 
 
 
 
 
 
 
 
1163
  except:
1164
- accuracy = -1
1165
 
1166
 
1167
- res = {
1168
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1169
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1170
- "Accuracy": accuracy,
1171
- }
1172
 
1173
- df_list.append(res)
1174
 
1175
 
1176
  df = pd.DataFrame(df_list)
@@ -1197,9 +1095,6 @@ CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")
1197
 
1198
 
1199
 
1200
-
1201
-
1202
-
1203
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1204
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1205
 
@@ -1209,25 +1104,24 @@ def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
1209
  df_list = []
1210
 
1211
  for model in MODEL_LIST:
1212
-
1213
-
1214
- results_list = [ALL_RESULTS[model][eval_mode]['cmmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu_full']]
1215
-
1216
 
1217
  try:
 
1218
  accuracy = median([results['accuracy'] for results in results_list])
1219
 
 
 
 
 
 
 
 
 
1220
  except:
1221
- accuracy = -1
1222
 
1223
 
1224
- res = {
1225
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1226
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1227
- "Accuracy": accuracy,
1228
- }
1229
 
1230
- df_list.append(res)
1231
 
1232
 
1233
  df = pd.DataFrame(df_list)
@@ -1263,25 +1157,20 @@ def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
1263
  df_list = []
1264
 
1265
  for model in MODEL_LIST:
1266
-
1267
-
1268
- results_list = [ALL_RESULTS[model][eval_mode]['zbench'][res] for res in ALL_RESULTS[model][eval_mode]['zbench']]
1269
-
1270
-
1271
  try:
 
1272
  accuracy = median([results['accuracy'] for results in results_list])
1273
 
1274
- except:
1275
- accuracy = -1
1276
-
 
 
1277
 
1278
- res = {
1279
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1280
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1281
- "Accuracy": accuracy,
1282
- }
1283
 
1284
- df_list.append(res)
 
1285
 
1286
 
1287
  df = pd.DataFrame(df_list)
@@ -1316,21 +1205,23 @@ def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True):
1316
 
1317
  for model in MODEL_LIST:
1318
 
1319
- results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']]
1320
 
1321
  try:
 
1322
  accuracy = median([results['accuracy'] for results in results_list])
1323
 
 
 
 
 
 
 
 
 
1324
  except:
1325
- accuracy = -1
1326
 
1327
- res = {
1328
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1329
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1330
- "Accuracy": accuracy,
1331
- }
1332
 
1333
- df_list.append(res)
1334
 
1335
 
1336
  df = pd.DataFrame(df_list)
@@ -1358,33 +1249,25 @@ INDOMMLU_FIVE_SHOT = get_data_indommlu(eval_mode="five_shot")
1358
 
1359
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1360
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1361
-
1362
-
1363
  def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
1364
 
1365
  df_list = []
1366
 
1367
  for model in MODEL_LIST:
1368
-
1369
-
1370
- results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']]
1371
-
1372
-
1373
  try:
 
1374
  accuracy = median([results['accuracy'] for results in results_list])
1375
 
1376
- except:
1377
- accuracy = -1
1378
-
1379
-
1380
- res = {
1381
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1382
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1383
- "Accuracy": accuracy,
1384
- }
1385
 
1386
- df_list.append(res)
1387
 
 
 
1388
 
1389
  df = pd.DataFrame(df_list)
1390
  # If there are any models that are the same, merge them
@@ -1404,7 +1287,6 @@ def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
1404
 
1405
  return df
1406
 
1407
-
1408
  IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
1409
  IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")
1410
 
@@ -1420,25 +1302,21 @@ def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
1420
  df_list = []
1421
 
1422
  for model in MODEL_LIST:
1423
-
1424
-
1425
- results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']]
1426
-
1427
 
1428
  try:
 
1429
  accuracy = median([results['accuracy'] for results in results_list])
1430
 
1431
- except:
1432
- accuracy = -1
1433
-
 
 
1434
 
1435
- res = {
1436
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1437
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1438
- "Accuracy": accuracy,
1439
- }
1440
 
1441
- df_list.append(res)
 
1442
 
1443
 
1444
  df = pd.DataFrame(df_list)
@@ -1474,26 +1352,21 @@ def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
1474
  df_list = []
1475
 
1476
  for model in MODEL_LIST:
1477
-
1478
-
1479
- results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']]
1480
-
1481
 
1482
  try:
 
1483
  accuracy = median([results['accuracy'] for results in results_list])
1484
 
1485
- except:
1486
- accuracy = -1
1487
-
 
 
1488
 
1489
- res = {
1490
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1491
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1492
- "Accuracy": accuracy,
1493
- }
1494
-
1495
- df_list.append(res)
1496
 
 
 
1497
 
1498
  df = pd.DataFrame(df_list)
1499
  # If there are any models that are the same, merge them
@@ -1528,25 +1401,21 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
1528
  df_list = []
1529
 
1530
  for model in MODEL_LIST:
1531
-
1532
-
1533
- results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']]
1534
-
1535
 
1536
  try:
 
1537
  accuracy = median([results['accuracy'] for results in results_list])
1538
 
1539
- except:
1540
- accuracy = -1
1541
-
 
 
1542
 
1543
- res = {
1544
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1545
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1546
- "Accuracy": accuracy,
1547
- }
1548
 
1549
- df_list.append(res)
 
1550
 
1551
 
1552
  df = pd.DataFrame(df_list)
@@ -1567,47 +1436,36 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
1567
 
1568
  return df
1569
 
1570
-
1571
  DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
1572
  DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")
1573
 
1574
-
1575
-
1576
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1577
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1578
-
1579
-
1580
  def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
1581
 
1582
  df_list = []
1583
 
1584
  for model in MODEL_LIST:
1585
-
1586
-
1587
- results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']]
1588
-
1589
 
1590
  try:
 
 
1591
  rouge1 = median([results['rouge1'] for results in results_list])
1592
  rouge2 = median([results['rouge2'] for results in results_list])
1593
  rougeL = median([results['rougeL'] for results in results_list])
1594
 
1595
- except:
1596
- rouge1 = -1
1597
- rouge2 = -1
1598
- rougeL = -1
1599
-
 
 
1600
 
1601
- res = {
1602
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1603
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1604
- "ROUGE-1": rouge1,
1605
- "ROUGE-2": rouge2,
1606
- "ROUGE-L": rougeL,
1607
- }
1608
-
1609
- df_list.append(res)
1610
 
 
 
1611
 
1612
  df = pd.DataFrame(df_list)
1613
  # If there are any models that are the same, merge them
@@ -1641,31 +1499,29 @@ def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
1641
  df_list = []
1642
 
1643
  for model in MODEL_LIST:
1644
-
1645
-
1646
- results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']]
1647
-
1648
-
1649
  try:
 
 
1650
  rouge1 = median([results['rouge1'] for results in results_list])
1651
  rouge2 = median([results['rouge2'] for results in results_list])
1652
  rougeL = median([results['rougeL'] for results in results_list])
1653
 
 
 
 
 
 
 
 
 
 
 
1654
  except:
1655
- rouge1 = -1
1656
- rouge2 = -1
1657
- rougeL = -1
1658
 
1659
 
1660
- res = {
1661
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1662
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1663
- "ROUGE-1": rouge1,
1664
- "ROUGE-2": rouge2,
1665
- "ROUGE-L": rougeL,
1666
- }
1667
 
1668
- df_list.append(res)
1669
 
1670
 
1671
  df = pd.DataFrame(df_list)
@@ -1703,24 +1559,23 @@ def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
1703
 
1704
  for model in MODEL_LIST:
1705
 
1706
-
1707
- results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']]
1708
-
1709
-
1710
  try:
 
1711
  accuracy = median([results['accuracy'] for results in results_list])
1712
 
 
 
 
 
 
 
 
 
1713
  except:
1714
- accuracy = -1
1715
 
1716
 
1717
- res = {
1718
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1719
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1720
- "Accuracy": accuracy,
1721
- }
1722
 
1723
- df_list.append(res)
1724
 
1725
 
1726
  df = pd.DataFrame(df_list)
@@ -1757,26 +1612,21 @@ def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
1757
  df_list = []
1758
 
1759
  for model in MODEL_LIST:
1760
-
1761
-
1762
- results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']]
1763
-
1764
 
1765
  try:
 
1766
  accuracy = median([results['accuracy'] for results in results_list])
1767
 
1768
- except:
1769
- accuracy = -1
1770
-
1771
-
1772
- res = {
1773
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1774
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1775
- "Accuracy": accuracy,
1776
- }
1777
 
1778
- df_list.append(res)
1779
 
 
 
1780
 
1781
  df = pd.DataFrame(df_list)
1782
  # If there are any models that are the same, merge them
@@ -1814,24 +1664,20 @@ def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
1814
 
1815
  for model in MODEL_LIST:
1816
 
1817
-
1818
- results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']]
1819
-
1820
-
1821
  try:
 
1822
  accuracy = median([results['accuracy'] for results in results_list])
1823
 
1824
- except:
1825
- accuracy = -1
1826
-
 
 
1827
 
1828
- res = {
1829
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1830
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1831
- "Accuracy": accuracy,
1832
- }
1833
 
1834
- df_list.append(res)
 
1835
 
1836
 
1837
  df = pd.DataFrame(df_list)
@@ -1869,25 +1715,21 @@ def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
1869
  df_list = []
1870
 
1871
  for model in MODEL_LIST:
1872
-
1873
-
1874
- results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']]
1875
-
1876
-
1877
  try:
 
1878
  accuracy = median([results['accuracy'] for results in results_list])
1879
 
1880
- except:
1881
- accuracy = -1
1882
-
 
 
1883
 
1884
- res = {
1885
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1886
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1887
- "Accuracy": accuracy,
1888
- }
1889
 
1890
- df_list.append(res)
 
1891
 
1892
 
1893
  df = pd.DataFrame(df_list)
@@ -1925,26 +1767,21 @@ def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
1925
  df_list = []
1926
 
1927
  for model in MODEL_LIST:
1928
-
1929
-
1930
- results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']]
1931
-
1932
 
1933
  try:
 
1934
  accuracy = median([results['accuracy'] for results in results_list])
1935
 
1936
- except:
1937
- accuracy = -1
1938
-
1939
-
1940
- res = {
1941
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1942
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1943
- "Accuracy": accuracy,
1944
- }
1945
 
1946
- df_list.append(res)
1947
 
 
 
1948
 
1949
  df = pd.DataFrame(df_list)
1950
  # If there are any models that are the same, merge them
@@ -1981,26 +1818,21 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
1981
  df_list = []
1982
 
1983
  for model in MODEL_LIST:
1984
-
1985
-
1986
- results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']]
1987
-
1988
 
1989
  try:
 
1990
  accuracy = median([results['accuracy'] for results in results_list])
1991
 
1992
- except:
1993
- accuracy = -1
1994
-
1995
-
1996
- res = {
1997
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1998
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1999
- "Accuracy": accuracy,
2000
- }
2001
 
2002
- df_list.append(res)
2003
 
 
 
2004
 
2005
  df = pd.DataFrame(df_list)
2006
  # If there are any models that are the same, merge them
@@ -2020,14 +1852,10 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
2020
 
2021
  return df
2022
 
2023
-
2024
  WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
2025
  WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")
2026
 
2027
 
2028
-
2029
-
2030
-
2031
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2032
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2033
 
@@ -2037,26 +1865,20 @@ def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
2037
  df_list = []
2038
 
2039
  for model in MODEL_LIST:
2040
-
2041
-
2042
- results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']]
2043
-
2044
-
2045
  try:
 
2046
  accuracy = median([results['accuracy'] for results in results_list])
2047
 
2048
- except:
2049
- accuracy = -1
2050
-
2051
-
2052
- res = {
2053
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
2054
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
2055
- "Accuracy": accuracy,
2056
- }
2057
 
2058
- df_list.append(res)
2059
 
 
 
2060
 
2061
  df = pd.DataFrame(df_list)
2062
  # If there are any models that are the same, merge them
@@ -2081,39 +1903,28 @@ RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot")
2081
  RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")
2082
 
2083
 
2084
-
2085
-
2086
-
2087
-
2088
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2089
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2090
-
2091
-
2092
  def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
2093
 
2094
  df_list = []
2095
 
2096
  for model in MODEL_LIST:
2097
-
2098
-
2099
- results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']]
2100
-
2101
-
2102
  try:
 
2103
  accuracy = median([results['accuracy'] for results in results_list])
2104
 
2105
- except:
2106
- accuracy = -1
2107
-
 
 
2108
 
2109
- res = {
2110
- "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
2111
- "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
2112
- "Accuracy": accuracy,
2113
- }
2114
-
2115
- df_list.append(res)
2116
 
 
 
2117
 
2118
  df = pd.DataFrame(df_list)
2119
  # If there are any models that are the same, merge them
@@ -2210,8 +2021,8 @@ with block:
2210
  - **Mode of Evaluation**: Zero-Shot, Five-Shot
2211
 
2212
  ### The following table shows the performance of the models on the SeaEval benchmark.
2213
- - For **Zero-shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
2214
- - (-1) value indicates the results are ready yet.
2215
  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2216
 
2217
  """)
@@ -2348,7 +2159,7 @@ with block:
2348
 
2349
 
2350
 
2351
- with gr.TabItem("Cultural Reasoning and Understanding"):
2352
 
2353
  # dataset 3: SG_EVAL
2354
  with gr.TabItem("SG_EVAL"):
@@ -2697,7 +2508,7 @@ with block:
2697
  """)
2698
 
2699
 
2700
- with gr.TabItem("FLORES Translation"):
2701
 
2702
 
2703
  # dataset 8:
@@ -2805,7 +2616,7 @@ with block:
2805
  """)
2806
 
2807
 
2808
- with gr.TabItem("Emotion Recognition"):
2809
 
2810
  # dataset 18:
2811
  with gr.TabItem("ind_emotion"):
@@ -2941,7 +2752,7 @@ with block:
2941
 
2942
 
2943
 
2944
- with gr.TabItem("Fundamental NLP"):
2945
 
2946
 
2947
  # dataset
 
55
  df_list = []
56
 
57
  for model in MODEL_LIST:
 
 
 
 
58
 
59
  try:
60
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
61
+
62
  overall_acc = [results['overall_acc'] for results in results_list]
63
  overall_acc = median(overall_acc)
64
 
 
68
  AC3_3 = [results['AC3_3'] for results in results_list]
69
  AC3_3 = median(AC3_3)
70
 
71
+ res = {
72
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
73
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
74
+ "Accuracy": overall_acc,
75
+ "Cross-Lingual Consistency": consistency_score_3,
76
+ "AC3": AC3_3,
77
+ }
78
 
79
+ df_list.append(res)
 
 
 
 
 
 
80
 
81
+ except:
82
+ print('Not found in model: {} for {}'.format(model, "cross_xquad_overall"))
83
 
84
 
85
  df = pd.DataFrame(df_list)
 
100
 
101
  return df
102
 
 
103
  CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
104
  CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
105
 
 
109
  df_list = []
110
 
111
  for model in MODEL_LIST:
112
+
 
 
 
 
113
  try:
114
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
115
+
116
  English = [results['language_acc']['English'] for results in results_list]
117
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
118
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
 
123
  Chinese = median(Chinese)
124
  Spanish = median(Spanish)
125
 
126
+ res = {
127
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
128
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
129
+ "English": English,
130
+ "Vietnamese": Vietnamese,
131
+ "Chinese": Chinese,
132
+ "Spanish": Spanish,
133
+ }
134
 
135
+ df_list.append(res)
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ except:
138
+ print('Not found in model: {} for {}'.format(model, "cross_xquad_lang"))
139
 
140
 
141
  df = pd.DataFrame(df_list)
 
156
 
157
  return df
158
 
 
159
  CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
160
  CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
161
 
 
174
  df_list = []
175
 
176
  for model in MODEL_LIST:
 
 
 
 
177
 
178
  try:
179
+
180
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
181
+
182
  overall_acc = [results['overall_acc'] for results in results_list]
183
  overall_acc = median(overall_acc)
184
 
 
188
  AC3_3 = [results['AC3_3'] for results in results_list]
189
  AC3_3 = median(AC3_3)
190
 
191
+ res = {
192
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
193
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
194
+ "Accuracy": overall_acc,
195
+ "Cross-Lingual Consistency": consistency_score_3,
196
+ "AC3": AC3_3,
197
+ }
198
+ df_list.append(res)
 
 
 
 
199
 
200
+ except:
201
+ print('Not found in model: {} for {}'.format(model, "cross_mmlu_overall"))
202
 
203
 
204
  df = pd.DataFrame(df_list)
 
219
 
220
  return df
221
 
 
222
  CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
223
  CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
224
 
 
228
  df_list = []
229
 
230
  for model in MODEL_LIST:
231
+
232
+ try:
233
 
234
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
235
 
 
 
 
 
236
  English = [results['language_acc']['English'] for results in results_list]
237
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
238
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
 
249
  Spanish = median(Spanish)
250
  Malay = median(Malay)
251
 
252
+ res = {
253
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
254
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
255
+ "English": English,
256
+ "Vietnamese": Vietnamese,
257
+ "Chinese": Chinese,
258
+ "Indonesian": Indonesian,
259
+ "Filipino": Filipino,
260
+ "Spanish": Spanish,
261
+ "Malay": Malay,
262
+ }
263
 
264
+ df_list.append(res)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
+ except:
267
+ print('Not found in model: {} for {}'.format(model, "cross_mmlu_lang"))
268
 
269
  df = pd.DataFrame(df_list)
270
  # If there are any models that are the same, merge them
 
284
 
285
  return df
286
 
 
287
  CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
288
  CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
289
 
 
298
  df_list = []
299
 
300
  for model in MODEL_LIST:
301
+
302
+ try:
303
 
304
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
305
 
 
 
 
 
306
  overall_acc = [results['overall_acc'] for results in results_list]
307
  overall_acc = median(overall_acc)
308
 
 
312
  AC3_3 = [results['AC3_3'] for results in results_list]
313
  AC3_3 = median(AC3_3)
314
 
315
+ res = {
316
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
317
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
318
+ "Accuracy": overall_acc,
319
+ "Cross-Lingual Consistency": consistency_score_3,
320
+ "AC3": AC3_3,
321
+ }
322
 
323
+ df_list.append(res)
 
 
 
 
 
 
324
 
325
+ except:
326
+ print('Not found in model: {} for {}'.format(model, "cross_logiqa_overall"))
327
 
328
 
329
  df = pd.DataFrame(df_list)
 
354
  df_list = []
355
 
356
  for model in MODEL_LIST:
357
+
358
+ try:
359
 
360
+ results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
361
 
 
 
 
 
362
  English = [results['language_acc']['English'] for results in results_list]
363
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
364
  Chinese = [results['language_acc']['Chinese'] for results in results_list]
 
375
  Spanish = median(Spanish)
376
  Malay = median(Malay)
377
 
378
+ res = {
379
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
380
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
381
+ "English": English,
382
+ "Vietnamese": Vietnamese,
383
+ "Chinese": Chinese,
384
+ "Indonesian": Indonesian,
385
+ "Filipino": Filipino,
386
+ "Spanish": Spanish,
387
+ "Malay": Malay,
388
+ }
389
 
390
+ df_list.append(res)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
+ except:
393
+ print('Not found in model: {} for {}'.format(model, "cross_logiqa_language"))
394
 
395
+
396
 
397
  df = pd.DataFrame(df_list)
398
  # If there are any models that are the same, merge them
 
425
  df_list = []
426
 
427
  for model in MODEL_LIST:
 
 
 
 
428
 
429
  try:
430
+
431
+ results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
432
  accuracy = median([results['accuracy'] for results in results_list])
433
 
434
+ res = {
435
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
436
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
437
+ "Accuracy": accuracy,
438
+ }
439
 
440
+ df_list.append(res)
441
+
442
+ except:
443
+ print('Not found in model: {} for {}'.format(model, "sg_eval"))
 
444
 
 
445
 
446
 
447
  df = pd.DataFrame(df_list)
 
477
 
478
  for model in MODEL_LIST:
479
 
 
 
 
 
480
  try:
481
+ results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
482
  accuracy = median([results['accuracy'] for results in results_list])
483
 
484
+ res = {
485
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
486
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
487
+ "Accuracy": accuracy,
488
+ }
489
 
490
+ df_list.append(res)
 
 
 
 
491
 
492
+ except:
493
+ print('Not found in model: {} for {}'.format(model, "us_eval"))
494
 
495
 
496
  df = pd.DataFrame(df_list)
 
525
  df_list = []
526
 
527
  for model in MODEL_LIST:
 
 
 
 
528
 
529
  try:
530
+ results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
531
  accuracy = median([results['accuracy'] for results in results_list])
532
 
533
+ res = {
534
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
535
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
536
+ "Accuracy": accuracy,
537
+ }
538
 
539
+ df_list.append(res)
 
 
 
 
 
 
540
 
541
+ except:
542
+ print('Not found in model: {} for {}'.format(model, "cn_eval"))
543
 
544
  df = pd.DataFrame(df_list)
545
  # If there are any models that are the same, merge them
 
559
 
560
  return df
561
 
 
562
  CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
563
  CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
564
 
 
566
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
567
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
568
 
 
569
  def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
570
 
571
  df_list = []
 
573
  for model in MODEL_LIST:
574
 
575
 
 
576
 
577
 
578
  try:
579
+ results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
580
  accuracy = median([results['accuracy'] for results in results_list])
581
+ res = {
582
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
583
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
584
+ "Accuracy": accuracy,
585
+ }
586
 
587
+ df_list.append(res)
 
 
588
 
589
+ except:
590
+ print('Not found in model: {} for {}'.format(model, "ph_eval"))
 
 
 
 
 
591
 
592
 
593
  df = pd.DataFrame(df_list)
 
622
  df_list = []
623
 
624
  for model in MODEL_LIST:
 
 
 
 
625
 
626
  try:
627
+ results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
628
  bleu_score = median([results['bleu_score'] for results in results_list])
629
 
630
+ res = {
631
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
632
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
633
+ "BLEU": bleu_score,
634
+ }
635
 
636
+ df_list.append(res)
 
 
 
 
637
 
638
+ except:
639
+ print('Not found in model: {} for {}'.format(model, "sing2eng"))
640
 
641
 
642
  df = pd.DataFrame(df_list)
 
670
  df_list = []
671
 
672
  for model in MODEL_LIST:
 
 
 
 
673
 
674
  try:
675
+ results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
676
  bleu_score = median([results['bleu_score'] for results in results_list])
677
 
678
+ res = {
679
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
680
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
681
+ "BLEU": bleu_score,
682
+ }
683
 
684
+ df_list.append(res)
 
 
 
 
685
 
686
+ except:
687
+ print('Not found in model: {} for {}'.format(model, "flores_ind2eng"))
688
 
689
 
690
  df = pd.DataFrame(df_list)
 
720
  df_list = []
721
 
722
  for model in MODEL_LIST:
 
 
 
 
723
 
724
  try:
725
+ results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
726
  bleu_score = median([results['bleu_score'] for results in results_list])
727
 
728
+ res = {
729
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
730
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
731
+ "BLEU": bleu_score,
732
+ }
 
 
 
 
733
 
734
+ df_list.append(res)
735
 
736
+ except:
737
+ print('Not found in model: {} for {}'.format(model, "flores_vie2eng"))
738
 
739
  df = pd.DataFrame(df_list)
740
  # If there are any models that are the same, merge them
 
767
  df_list = []
768
 
769
  for model in MODEL_LIST:
 
 
 
 
770
 
771
  try:
772
+ results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
773
  bleu_score = median([results['bleu_score'] for results in results_list])
774
 
775
+ res = {
776
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
777
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
778
+ "BLEU": bleu_score,
779
+ }
 
 
 
 
780
 
781
+ df_list.append(res)
782
 
783
+ except:
784
+ print('Not found in model: {} for {}'.format(model, "flores_zho2eng"))
785
 
786
  df = pd.DataFrame(df_list)
787
  # If there are any models that are the same, merge them
 
801
 
802
  return df
803
 
 
804
  FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
805
  FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
806
 
 
814
  df_list = []
815
 
816
  for model in MODEL_LIST:
817
+
 
 
 
 
818
  try:
819
+ results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
820
  bleu_score = median([results['bleu_score'] for results in results_list])
821
 
822
+ res = {
823
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
824
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
825
+ "BLEU": bleu_score,
826
+ }
827
+ df_list.append(res)
 
 
 
 
 
828
 
829
+ except:
830
+ print('Not found in model: {} for {}'.format(model, "flores_zsm2eng"))
831
 
832
  df = pd.DataFrame(df_list)
833
  # If there are any models that are the same, merge them
 
847
 
848
  return df
849
 
 
850
  FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
851
  FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
852
 
 
860
  df_list = []
861
 
862
  for model in MODEL_LIST:
863
+
 
 
 
 
864
  try:
865
+ results_list = [ALL_RESULTS[model][eval_mode]['mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu']]
866
  accuracy = median([results['accuracy'] for results in results_list])
867
 
868
+ res = {
869
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
870
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
871
+ "Accuracy": accuracy,
872
+ }
873
+ df_list.append(res)
874
+
875
  except:
876
  accuracy = -1
877
 
 
 
 
 
 
 
 
 
 
 
878
  df = pd.DataFrame(df_list)
879
  # If there are any models that are the same, merge them
880
  # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
 
901
 
902
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
903
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
904
  def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
905
 
906
  df_list = []
907
 
908
  for model in MODEL_LIST:
909
+
 
 
 
 
910
  try:
911
+ results_list = [ALL_RESULTS[model][eval_mode]['mmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu_full']]
912
  accuracy = median([results['accuracy'] for results in results_list])
913
 
914
+ res = {
915
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
916
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
917
+ "Accuracy": accuracy,
918
+ }
919
 
920
+ df_list.append(res)
 
 
 
 
921
 
922
+ except:
923
+ print('Not found in model: {} for {}'.format(model, "mmlu_full"))
924
 
925
 
926
  df = pd.DataFrame(df_list)
 
941
 
942
  return df
943
 
 
944
  MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
945
  MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
946
 
947
 
948
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
949
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
950
  def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
951
 
952
  df_list = []
953
 
954
+ for model in MODEL_LIST:
 
 
 
 
 
955
  try:
956
+ results_list = [ALL_RESULTS[model][eval_mode]['c_eval'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval']]
957
  accuracy = median([results['accuracy'] for results in results_list])
958
 
959
+ res = {
960
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
961
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
962
+ "Accuracy": accuracy,
963
+ }
 
 
 
 
964
 
965
+ df_list.append(res)
966
 
967
+ except:
968
+ print('Not found in model: {} for {}'.format(model, "c_eval"))
969
 
970
  df = pd.DataFrame(df_list)
971
  # If there are any models that are the same, merge them
 
985
 
986
  return df
987
 
 
988
  C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
989
  C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
990
 
 
998
  df_list = []
999
 
1000
  for model in MODEL_LIST:
1001
+
 
 
 
 
1002
  try:
1003
+ results_list = [ALL_RESULTS[model][eval_mode]['c_eval_full'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval_full']]
1004
  accuracy = median([results['accuracy'] for results in results_list])
1005
 
1006
+ res = {
1007
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1008
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1009
+ "Accuracy": accuracy,
1010
+ }
1011
 
1012
+ df_list.append(res)
1013
+
1014
+ except:
1015
+ print('Not found in model: {} for {}'.format(model, "c_eval_full"))
1016
 
 
 
 
 
 
1017
 
 
1018
 
1019
 
1020
  df = pd.DataFrame(df_list)
 
1051
  df_list = []
1052
 
1053
  for model in MODEL_LIST:
1054
+
 
 
 
 
1055
  try:
1056
+ results_list = [ALL_RESULTS[model][eval_mode]['cmmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu']]
1057
  accuracy = median([results['accuracy'] for results in results_list])
1058
 
1059
+ res = {
1060
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1061
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1062
+ "Accuracy": accuracy,
1063
+ }
1064
+
1065
+ df_list.append(res)
1066
+
1067
  except:
1068
+ print('Not found in model: {} for {}'.format(model, "cmmlu"))
1069
 
1070
 
 
 
 
 
 
1071
 
 
1072
 
1073
 
1074
  df = pd.DataFrame(df_list)
 
1095
 
1096
 
1097
 
 
 
 
1098
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1099
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1100
 
 
1104
  df_list = []
1105
 
1106
  for model in MODEL_LIST:
 
 
 
 
1107
 
1108
  try:
1109
+ results_list = [ALL_RESULTS[model][eval_mode]['cmmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu_full']]
1110
  accuracy = median([results['accuracy'] for results in results_list])
1111
 
1112
+ res = {
1113
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1114
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1115
+ "Accuracy": accuracy,
1116
+ }
1117
+
1118
+ df_list.append(res)
1119
+
1120
  except:
1121
+ print('Not found in model: {} for {}'.format(model, "cmmlu_full"))
1122
 
1123
 
 
 
 
 
 
1124
 
 
1125
 
1126
 
1127
  df = pd.DataFrame(df_list)
 
1157
  df_list = []
1158
 
1159
  for model in MODEL_LIST:
 
 
 
 
 
1160
  try:
1161
+ results_list = [ALL_RESULTS[model][eval_mode]['zbench'][res] for res in ALL_RESULTS[model][eval_mode]['zbench']]
1162
  accuracy = median([results['accuracy'] for results in results_list])
1163
 
1164
+ res = {
1165
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1166
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1167
+ "Accuracy": accuracy,
1168
+ }
1169
 
1170
+ df_list.append(res)
 
 
 
 
1171
 
1172
+ except:
1173
+ print('Not found in model: {} for {}'.format(model, "zbench"))
1174
 
1175
 
1176
  df = pd.DataFrame(df_list)
 
1205
 
1206
  for model in MODEL_LIST:
1207
 
 
1208
 
1209
  try:
1210
+ results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']]
1211
  accuracy = median([results['accuracy'] for results in results_list])
1212
 
1213
+ res = {
1214
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1215
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1216
+ "Accuracy": accuracy,
1217
+ }
1218
+
1219
+ df_list.append(res)
1220
+
1221
  except:
1222
+ print('Not found in model: {} for {}'.format(model, "indommlu"))
1223
 
 
 
 
 
 
1224
 
 
1225
 
1226
 
1227
  df = pd.DataFrame(df_list)
 
1249
 
1250
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1251
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
1252
  def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
1253
 
1254
  df_list = []
1255
 
1256
  for model in MODEL_LIST:
 
 
 
 
 
1257
  try:
1258
+ results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']]
1259
  accuracy = median([results['accuracy'] for results in results_list])
1260
 
1261
+ res = {
1262
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1263
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1264
+ "Accuracy": accuracy,
1265
+ }
 
 
 
 
1266
 
1267
+ df_list.append(res)
1268
 
1269
+ except:
1270
+ print('Not found in model: {} for {}'.format(model, "ind_emotion"))
1271
 
1272
  df = pd.DataFrame(df_list)
1273
  # If there are any models that are the same, merge them
 
1287
 
1288
  return df
1289
 
 
1290
  IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
1291
  IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")
1292
 
 
1302
  df_list = []
1303
 
1304
  for model in MODEL_LIST:
 
 
 
 
1305
 
1306
  try:
1307
+ results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']]
1308
  accuracy = median([results['accuracy'] for results in results_list])
1309
 
1310
+ res = {
1311
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1312
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1313
+ "Accuracy": accuracy,
1314
+ }
1315
 
1316
+ df_list.append(res)
 
 
 
 
1317
 
1318
+ except:
1319
+ print('Not found in model: {} for {}'.format(model, "ocnli"))
1320
 
1321
 
1322
  df = pd.DataFrame(df_list)
 
1352
  df_list = []
1353
 
1354
  for model in MODEL_LIST:
 
 
 
 
1355
 
1356
  try:
1357
+ results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']]
1358
  accuracy = median([results['accuracy'] for results in results_list])
1359
 
1360
+ res = {
1361
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1362
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1363
+ "Accuracy": accuracy,
1364
+ }
1365
 
1366
+ df_list.append(res)
 
 
 
 
 
 
1367
 
1368
+ except:
1369
+ print('Not found in model: {} for {}'.format(model, "c3"))
1370
 
1371
  df = pd.DataFrame(df_list)
1372
  # If there are any models that are the same, merge them
 
1401
  df_list = []
1402
 
1403
  for model in MODEL_LIST:
 
 
 
 
1404
 
1405
  try:
1406
+ results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']]
1407
  accuracy = median([results['accuracy'] for results in results_list])
1408
 
1409
+ res = {
1410
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1411
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1412
+ "Accuracy": accuracy,
1413
+ }
1414
 
1415
+ df_list.append(res)
 
 
 
 
1416
 
1417
+ except:
1418
+ print('Not found in model: {} for {}'.format(model, "dream"))
1419
 
1420
 
1421
  df = pd.DataFrame(df_list)
 
1436
 
1437
  return df
1438
 
 
1439
  DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
1440
  DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")
1441
 
 
 
1442
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1443
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
1444
  def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
1445
 
1446
  df_list = []
1447
 
1448
  for model in MODEL_LIST:
 
 
 
 
1449
 
1450
  try:
1451
+ results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']]
1452
+
1453
  rouge1 = median([results['rouge1'] for results in results_list])
1454
  rouge2 = median([results['rouge2'] for results in results_list])
1455
  rougeL = median([results['rougeL'] for results in results_list])
1456
 
1457
+ res = {
1458
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1459
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1460
+ "ROUGE-1": rouge1,
1461
+ "ROUGE-2": rouge2,
1462
+ "ROUGE-L": rougeL,
1463
+ }
1464
 
1465
+ df_list.append(res)
 
 
 
 
 
 
 
 
1466
 
1467
+ except:
1468
+ print('Not found in model: {} for {}'.format(model, "samsum"))
1469
 
1470
  df = pd.DataFrame(df_list)
1471
  # If there are any models that are the same, merge them
 
1499
  df_list = []
1500
 
1501
  for model in MODEL_LIST:
1502
+
 
 
 
 
1503
  try:
1504
+ results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']]
1505
+
1506
  rouge1 = median([results['rouge1'] for results in results_list])
1507
  rouge2 = median([results['rouge2'] for results in results_list])
1508
  rougeL = median([results['rougeL'] for results in results_list])
1509
 
1510
+ res = {
1511
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1512
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1513
+ "ROUGE-1": rouge1,
1514
+ "ROUGE-2": rouge2,
1515
+ "ROUGE-L": rougeL,
1516
+ }
1517
+
1518
+ df_list.append(res)
1519
+
1520
  except:
1521
+ print('Not found in model: {} for {}'.format(model, "dialogsum"))
 
 
1522
 
1523
 
 
 
 
 
 
 
 
1524
 
 
1525
 
1526
 
1527
  df = pd.DataFrame(df_list)
 
1559
 
1560
  for model in MODEL_LIST:
1561
 
 
 
 
 
1562
  try:
1563
+ results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']]
1564
  accuracy = median([results['accuracy'] for results in results_list])
1565
 
1566
+ res = {
1567
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1568
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1569
+ "Accuracy": accuracy,
1570
+ }
1571
+
1572
+ df_list.append(res)
1573
+
1574
  except:
1575
+ print('Not found in model: {} for {}'.format(model, "sst2"))
1576
 
1577
 
 
 
 
 
 
1578
 
 
1579
 
1580
 
1581
  df = pd.DataFrame(df_list)
 
1612
  df_list = []
1613
 
1614
  for model in MODEL_LIST:
 
 
 
 
1615
 
1616
  try:
1617
+ results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']]
1618
  accuracy = median([results['accuracy'] for results in results_list])
1619
 
1620
+ res = {
1621
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1622
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1623
+ "Accuracy": accuracy,
1624
+ }
 
 
 
 
1625
 
1626
+ df_list.append(res)
1627
 
1628
+ except:
1629
+ print('Not found in model: {} for {}'.format(model, "cola"))
1630
 
1631
  df = pd.DataFrame(df_list)
1632
  # If there are any models that are the same, merge them
 
1664
 
1665
  for model in MODEL_LIST:
1666
 
 
 
 
 
1667
  try:
1668
+ results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']]
1669
  accuracy = median([results['accuracy'] for results in results_list])
1670
 
1671
+ res = {
1672
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1673
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1674
+ "Accuracy": accuracy,
1675
+ }
1676
 
1677
+ df_list.append(res)
 
 
 
 
1678
 
1679
+ except:
1680
+ print('Not found in model: {} for {}'.format(model, "qqp"))
1681
 
1682
 
1683
  df = pd.DataFrame(df_list)
 
1715
  df_list = []
1716
 
1717
  for model in MODEL_LIST:
1718
+
 
 
 
 
1719
  try:
1720
+ results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']]
1721
  accuracy = median([results['accuracy'] for results in results_list])
1722
 
1723
+ res = {
1724
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1725
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1726
+ "Accuracy": accuracy,
1727
+ }
1728
 
1729
+ df_list.append(res)
 
 
 
 
1730
 
1731
+ except:
1732
+ print('Not found in model: {} for {}'.format(model, "mnli"))
1733
 
1734
 
1735
  df = pd.DataFrame(df_list)
 
1767
  df_list = []
1768
 
1769
  for model in MODEL_LIST:
 
 
 
 
1770
 
1771
  try:
1772
+ results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']]
1773
  accuracy = median([results['accuracy'] for results in results_list])
1774
 
1775
+ res = {
1776
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1777
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1778
+ "Accuracy": accuracy,
1779
+ }
 
 
 
 
1780
 
1781
+ df_list.append(res)
1782
 
1783
+ except:
1784
+ print('Not found in model: {} for {}'.format(model, "qnli"))
1785
 
1786
  df = pd.DataFrame(df_list)
1787
  # If there are any models that are the same, merge them
 
1818
  df_list = []
1819
 
1820
  for model in MODEL_LIST:
 
 
 
 
1821
 
1822
  try:
1823
+ results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']]
1824
  accuracy = median([results['accuracy'] for results in results_list])
1825
 
1826
+ res = {
1827
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1828
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1829
+ "Accuracy": accuracy,
1830
+ }
 
 
 
 
1831
 
1832
+ df_list.append(res)
1833
 
1834
+ except:
1835
+ print('Not found in model: {} for {}'.format(model, "wnli"))
1836
 
1837
  df = pd.DataFrame(df_list)
1838
  # If there are any models that are the same, merge them
 
1852
 
1853
  return df
1854
 
 
1855
  WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
1856
  WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")
1857
 
1858
 
 
 
 
1859
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1860
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1861
 
 
1865
  df_list = []
1866
 
1867
  for model in MODEL_LIST:
 
 
 
 
 
1868
  try:
1869
+ results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']]
1870
  accuracy = median([results['accuracy'] for results in results_list])
1871
 
1872
+ res = {
1873
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1874
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1875
+ "Accuracy": accuracy,
1876
+ }
 
 
 
 
1877
 
1878
+ df_list.append(res)
1879
 
1880
+ except:
1881
+ print('Not found in model: {} for {}'.format(model, "rte"))
1882
 
1883
  df = pd.DataFrame(df_list)
1884
  # If there are any models that are the same, merge them
 
1903
  RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")
1904
 
1905
 
 
 
 
 
1906
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1907
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
 
 
1908
  def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
1909
 
1910
  df_list = []
1911
 
1912
  for model in MODEL_LIST:
1913
+
 
 
 
 
1914
  try:
1915
+ results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']]
1916
  accuracy = median([results['accuracy'] for results in results_list])
1917
 
1918
+ res = {
1919
+ "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
1920
+ "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
1921
+ "Accuracy": accuracy,
1922
+ }
1923
 
1924
+ df_list.append(res)
 
 
 
 
 
 
1925
 
1926
+ except:
1927
+ print('Not found in model: {} for {}'.format(model, "mrpc"))
1928
 
1929
  df = pd.DataFrame(df_list)
1930
  # If there are any models that are the same, merge them
 
2021
  - **Mode of Evaluation**: Zero-Shot, Five-Shot
2022
 
2023
  ### The following table shows the performance of the models on the SeaEval benchmark.
2024
+ - For **Zero-Shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
2025
+ - I am trying to evaluate the base models for five-shot performance and instruction-tuned models for zero-shot.
2026
  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2027
 
2028
  """)
 
2159
 
2160
 
2161
 
2162
+ with gr.TabItem("Cultural Reasoning"):
2163
 
2164
  # dataset 3: SG_EVAL
2165
  with gr.TabItem("SG_EVAL"):
 
2508
  """)
2509
 
2510
 
2511
+ with gr.TabItem("FLORES-Translation"):
2512
 
2513
 
2514
  # dataset 8:
 
2616
  """)
2617
 
2618
 
2619
+ with gr.TabItem("Emotion"):
2620
 
2621
  # dataset 18:
2622
  with gr.TabItem("ind_emotion"):
 
2752
 
2753
 
2754
 
2755
+ with gr.TabItem("Fundamental NLP Tasks"):
2756
 
2757
 
2758
  # dataset