binwang committed on
Commit
ee26773
•
1 Parent(s): 8024fdd
Files changed (1) hide show
  1. app.py +618 -1
app.py CHANGED
@@ -816,6 +816,392 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
816
  FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
817
  FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
818
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
820
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
821
 
@@ -1179,7 +1565,7 @@ with block:
1179
 
1180
 
1181
 
1182
- # dataset 10:
1183
  with gr.TabItem("FLORES Malay to English Translation"):
1184
  with gr.Row():
1185
  gr.Markdown("""
@@ -1206,6 +1592,237 @@ with block:
1206
  datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
1207
  type="pandas",
1208
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1209
 
1210
  gr.Markdown(r"""
1211
 
 
816
  FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
817
  FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
818
 
819
+
820
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
821
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
822
+
823
+
824
def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the MMLU leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['mmlu']`` are collapsed into one score:
    the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['mmlu']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
MMLU_ZERO_SHOT = get_data_mmlu(eval_mode="zero_shot")
MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")
872
+
873
+
874
+
875
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
876
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
877
+
878
+
879
def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the MMLU Full leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['mmlu_full']`` are collapsed into one
    score: the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['mmlu_full']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
927
+
928
+
929
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
930
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
931
+
932
+
933
def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the C_EVAL leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['c_eval']`` are collapsed into one score:
    the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['c_eval']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
981
+
982
+
983
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
984
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
985
+
986
+
987
def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the C_EVAL Full leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['c_eval_full']`` are collapsed into one
    score: the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['c_eval_full']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
C_EVAL_FULL_ZERO_SHOT = get_data_c_eval_full(eval_mode="zero_shot")
C_EVAL_FULL_FIVE_SHOT = get_data_c_eval_full(eval_mode="five_shot")
1035
+
1036
+
1037
+
1038
+
1039
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1040
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1041
+
1042
+
1043
def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the CMMLU leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['cmmlu']`` are collapsed into one score:
    the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['cmmlu']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
CMMLU_ZERO_SHOT = get_data_cmmlu(eval_mode="zero_shot")
CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")
1091
+
1092
+
1093
+
1094
+
1095
+
1096
+
1097
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1098
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1099
+
1100
+
1101
def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the CMMLU Full leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['cmmlu_full']`` are collapsed into one
    score: the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['cmmlu_full']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
CMMLU_FULL_ZERO_SHOT = get_data_cmmlu_full(eval_mode="zero_shot")
CMMLU_FULL_FIVE_SHOT = get_data_cmmlu_full(eval_mode="five_shot")
1149
+
1150
+
1151
+
1152
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1153
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1154
+
1155
+
1156
def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ZBench leaderboard table for one evaluation mode.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['zbench']`` are collapsed into one score:
    the median of their ``'accuracy'`` values.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]``; this file calls it with ``"zero_shot"``
            and ``"five_shot"``.
        fillna: When true, missing values in the final frame are replaced
            with empty strings.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` cell and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``.
        ``"Accuracy"`` is ``-1`` when the median could not be computed.
        NOTE(review): ``add_rank`` is defined elsewhere and may add or reorder
        columns -- confirm against its implementation.
    """
    rows = []

    for model in MODEL_LIST:
        task_results = ALL_RESULTS[model][eval_mode]['zbench']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort fallback for malformed or missing results (e.g. no
            # 'accuracy' key). A bare ``except:`` would also swallow
            # KeyboardInterrupt/SystemExit, which must propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Rows that render to the same "Model" cell are merged; for each column
    # the first non-null value within the group is kept.
    df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df = df.fillna("")

    return df


# Tables are materialised once at import time and reused by the UI below.
ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")
1204
+
1205
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1206
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1207
 
 
1565
 
1566
 
1567
 
1568
+ # dataset 11:
1569
  with gr.TabItem("FLORES Malay to English Translation"):
1570
  with gr.Row():
1571
  gr.Markdown("""
 
1592
  datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
1593
  type="pandas",
1594
  )
1595
+
1596
+
1597
+ # dataset 12:
1598
+ with gr.TabItem("MMLU"):
1599
+ with gr.Row():
1600
+ gr.Markdown("""
1601
+ **MMLU Leaderboard** 🔮
1602
+
1603
+ - **Metric:** Accuracy.
1604
+ - **Languages:** English
1605
+ """)
1606
+
1607
+ with gr.TabItem("zero_shot"):
1608
+ with gr.TabItem("Overall"):
1609
+ with gr.Row():
1610
+ gr.components.Dataframe(
1611
+ MMLU_ZERO_SHOT,
1612
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
1613
+ type="pandas",
1614
+ )
1615
+
1616
+ with gr.TabItem("five_shot"):
1617
+ with gr.TabItem("Overall"):
1618
+ with gr.Row():
1619
+ gr.components.Dataframe(
1620
+ MMLU_FIVE_SHOT,
1621
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
1622
+ type="pandas",
1623
+ )
1624
+
1625
+
1626
+ # dataset 13:
1627
+ with gr.TabItem("MMLU Full"):
1628
+ with gr.Row():
1629
+ gr.Markdown("""
1630
+ **MMLU Full Leaderboard** 🔮
1631
+
1632
+ - **Metric:** Accuracy.
1633
+ - **Languages:** English
1634
+ """)
1635
+
1636
+ with gr.TabItem("zero_shot"):
1637
+ with gr.TabItem("Overall"):
1638
+ with gr.Row():
1639
+ gr.components.Dataframe(
1640
+ MMLU_FULL_ZERO_SHOT,
1641
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
1642
+ type="pandas",
1643
+ )
1644
+
1645
+
1646
+
1647
+ with gr.TabItem("five_shot"):
1648
+ with gr.TabItem("Overall"):
1649
+ with gr.Row():
1650
+ gr.components.Dataframe(
1651
+ MMLU_FULL_FIVE_SHOT,
1652
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
1653
+ type="pandas",
1654
+ )
1655
+
1656
+ # dataset 14:
1657
+ with gr.TabItem("C_EVAL"):
1658
+ with gr.Row():
1659
+ gr.Markdown("""
1660
+ **C_EVAL Leaderboard** 🔮
1661
+
1662
+ - **Metric:** Accuracy.
1663
+ - **Languages:** Chinese
1664
+ """)
1665
+
1666
+ with gr.TabItem("zero_shot"):
1667
+ with gr.TabItem("Overall"):
1668
+ with gr.Row():
1669
+ gr.components.Dataframe(
1670
+ C_EVAL_ZERO_SHOT,
1671
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
1672
+ type="pandas",
1673
+ )
1674
+
1675
+
1676
+
1677
+ with gr.TabItem("five_shot"):
1678
+ with gr.TabItem("Overall"):
1679
+ with gr.Row():
1680
+ gr.components.Dataframe(
1681
+ C_EVAL_FIVE_SHOT,
1682
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
1683
+ type="pandas",
1684
+ )
1685
+
1686
+
1687
+ # dataset 15:
1688
+ with gr.TabItem("C_EVAL Full"):
1689
+ with gr.Row():
1690
+ gr.Markdown("""
1691
+ **C_EVAL Full Leaderboard** 🔮
1692
+
1693
+ - **Metric:** Accuracy.
1694
+ - **Languages:** Chinese
1695
+ """)
1696
+
1697
+ with gr.TabItem("zero_shot"):
1698
+ with gr.TabItem("Overall"):
1699
+ with gr.Row():
1700
+ gr.components.Dataframe(
1701
+ C_EVAL_FULL_ZERO_SHOT,
1702
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
1703
+ type="pandas",
1704
+ )
1705
+
1706
+
1707
+
1708
+ with gr.TabItem("five_shot"):
1709
+ with gr.TabItem("Overall"):
1710
+ with gr.Row():
1711
+ gr.components.Dataframe(
1712
+ C_EVAL_FULL_FIVE_SHOT,
1713
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
1714
+ type="pandas",
1715
+ )
1716
+
1717
+ # dataset 16:
1718
+ with gr.TabItem("CMMLU"):
1719
+ with gr.Row():
1720
+ gr.Markdown("""
1721
+ **CMMLU Leaderboard** 🔮
1722
+
1723
+ - **Metric:** Accuracy.
1724
+ - **Languages:** Chinese
1725
+ """)
1726
+
1727
+ with gr.TabItem("zero_shot"):
1728
+ with gr.TabItem("Overall"):
1729
+ with gr.Row():
1730
+ gr.components.Dataframe(
1731
+ CMMLU_ZERO_SHOT,
1732
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
1733
+ type="pandas",
1734
+ )
1735
+
1736
+
1737
+
1738
+ with gr.TabItem("five_shot"):
1739
+ with gr.TabItem("Overall"):
1740
+ with gr.Row():
1741
+ gr.components.Dataframe(
1742
+ CMMLU_FIVE_SHOT,
1743
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
1744
+ type="pandas",
1745
+ )
1746
+
1747
+ # dataset 17:
1748
+ with gr.TabItem("CMMLU Full"):
1749
+ with gr.Row():
1750
+ gr.Markdown("""
1751
+ **CMMLU Full Leaderboard** 🔮
1752
+
1753
+ - **Metric:** Accuracy.
1754
+ - **Languages:** Chinese
1755
+ """)
1756
+
1757
+ with gr.TabItem("zero_shot"):
1758
+ with gr.TabItem("Overall"):
1759
+ with gr.Row():
1760
+ gr.components.Dataframe(
1761
+ CMMLU_FULL_ZERO_SHOT,
1762
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
1763
+ type="pandas",
1764
+ )
1765
+
1766
+
1767
+
1768
+ with gr.TabItem("five_shot"):
1769
+ with gr.TabItem("Overall"):
1770
+ with gr.Row():
1771
+ gr.components.Dataframe(
1772
+ CMMLU_FULL_FIVE_SHOT,
1773
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
1774
+ type="pandas",
1775
+ )
1776
+
1777
+ # dataset 18:
1778
+ with gr.TabItem("ZBench"):
1779
+ with gr.Row():
1780
+ gr.Markdown("""
1781
+ **ZBench Leaderboard** 🔮
1782
+
1783
+ - **Metric:** Accuracy.
1784
+ - **Languages:** Chinese
1785
+ """)
1786
+
1787
+ with gr.TabItem("zero_shot"):
1788
+ with gr.TabItem("Overall"):
1789
+ with gr.Row():
1790
+ gr.components.Dataframe(
1791
+ ZBENCH_ZERO_SHOT,
1792
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
1793
+ type="pandas",
1794
+ )
1795
+
1796
+
1797
+
1798
+ with gr.TabItem("five_shot"):
1799
+ with gr.TabItem("Overall"):
1800
+ with gr.Row():
1801
+ gr.components.Dataframe(
1802
+ ZBENCH_FIVE_SHOT,
1803
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
1804
+ type="pandas",
1805
+ )
1806
+
1807
+
1808
+
1809
+
1810
+
1811
+
1812
+
1813
+
1814
+
1815
+
1816
+
1817
+
1818
+
1819
+
1820
+
1821
+
1822
+
1823
+
1824
+
1825
+
1826
 
1827
  gr.Markdown(r"""
1828