rodrigomasini commited on
Commit
e1e11ec
1 Parent(s): 2ad8c60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -863
app.py CHANGED
@@ -929,21 +929,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
929
  meta = metadata_load(readme_path)
930
  if "model-index" not in meta:
931
  continue
932
- # meta['model-index'][0]["results"] is list of elements like:
933
- # {
934
- # "task": {"type": "Classification"},
935
- # "dataset": {
936
- # "type": "mteb/amazon_massive_intent",
937
- # "name": "MTEB MassiveIntentClassification (nb)",
938
- # "config": "nb",
939
- # "split": "test",
940
- # },
941
- # "metrics": [
942
- # {"type": "accuracy", "value": 39.81506388702084},
943
- # {"type": "f1", "value": 38.809586587791664},
944
- # ],
945
- # },
946
- # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
947
  if len(datasets) > 0:
948
  task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
949
  elif langs:
@@ -977,7 +963,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
977
  return df
978
 
979
  def get_mteb_average():
980
- global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION
981
  DATA_OVERALL = get_mteb_data(
982
  tasks=[
983
  "Classification",
@@ -1010,28 +996,6 @@ def get_mteb_average():
1010
 
1011
  DATA_OVERALL = DATA_OVERALL.round(2)
1012
 
1013
- DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
1014
- # Only keep rows with at least one score in addition to the "Model" & rank column
1015
- DATA_CLASSIFICATION_EN = DATA_CLASSIFICATION_EN[DATA_CLASSIFICATION_EN.iloc[:, 2:].ne("").any(axis=1)]
1016
-
1017
- DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
1018
- DATA_CLUSTERING = DATA_CLUSTERING[DATA_CLUSTERING.iloc[:, 2:].ne("").any(axis=1)]
1019
-
1020
- DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
1021
- DATA_PAIR_CLASSIFICATION = DATA_PAIR_CLASSIFICATION[DATA_PAIR_CLASSIFICATION.iloc[:, 2:].ne("").any(axis=1)]
1022
-
1023
- DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
1024
- DATA_RERANKING = DATA_RERANKING[DATA_RERANKING.iloc[:, 2:].ne("").any(axis=1)]
1025
-
1026
- DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
1027
- DATA_RETRIEVAL = DATA_RETRIEVAL[DATA_RETRIEVAL.iloc[:, 2:].ne("").any(axis=1)]
1028
-
1029
- DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
1030
- DATA_STS_EN = DATA_STS_EN[DATA_STS_EN.iloc[:, 2:].ne("").any(axis=1)]
1031
-
1032
- DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
1033
- DATA_SUMMARIZATION = DATA_SUMMARIZATION[DATA_SUMMARIZATION.iloc[:, 1:].ne("").any(axis=1)]
1034
-
1035
  # Fill NaN after averaging
1036
  DATA_OVERALL.fillna("", inplace=True)
1037
 
@@ -1040,838 +1004,96 @@ def get_mteb_average():
1040
 
1041
  return DATA_OVERALL
1042
 
1043
- def get_mteb_average_zh():
1044
- global DATA_OVERALL_ZH, DATA_CLASSIFICATION_ZH, DATA_CLUSTERING_ZH, DATA_PAIR_CLASSIFICATION_ZH, DATA_RERANKING_ZH, DATA_RETRIEVAL_ZH, DATA_STS_ZH
1045
- DATA_OVERALL_ZH = get_mteb_data(
1046
- tasks=[
1047
- "Classification",
1048
- "Clustering",
1049
- "PairClassification",
1050
- "Reranking",
1051
- "Retrieval",
1052
- "STS",
1053
- ],
1054
- datasets=TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH,
1055
- fillna=False,
1056
- add_emb_dim=True,
1057
- rank=False,
1058
- )
1059
- # Debugging:
1060
- # DATA_OVERALL_ZH.to_csv("overall.csv")
1061
-
1062
- DATA_OVERALL_ZH.insert(1, f"Average ({len(TASK_LIST_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_ZH].mean(axis=1, skipna=False))
1063
- DATA_OVERALL_ZH.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLASSIFICATION_ZH].mean(axis=1, skipna=False))
1064
- DATA_OVERALL_ZH.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLUSTERING_ZH].mean(axis=1, skipna=False))
1065
- DATA_OVERALL_ZH.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_PAIR_CLASSIFICATION_ZH].mean(axis=1, skipna=False))
1066
- DATA_OVERALL_ZH.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RERANKING_ZH].mean(axis=1, skipna=False))
1067
- DATA_OVERALL_ZH.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RETRIEVAL_ZH].mean(axis=1, skipna=False))
1068
- DATA_OVERALL_ZH.insert(7, f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_STS_ZH].mean(axis=1, skipna=False))
1069
- DATA_OVERALL_ZH.sort_values(f"Average ({len(TASK_LIST_ZH)} datasets)", ascending=False, inplace=True)
1070
- # Start ranking from 1
1071
- DATA_OVERALL_ZH.insert(0, "Rank", list(range(1, len(DATA_OVERALL_ZH) + 1)))
1072
-
1073
- DATA_OVERALL_ZH = DATA_OVERALL_ZH.round(2)
1074
-
1075
- DATA_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_CLASSIFICATION_ZH])
1076
- # Only keep rows with at least one score in addition to the "Model" & rank column
1077
- DATA_CLASSIFICATION_ZH = DATA_CLASSIFICATION_ZH[DATA_CLASSIFICATION_ZH.iloc[:, 2:].ne("").any(axis=1)]
1078
-
1079
- DATA_CLUSTERING_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_CLUSTERING_ZH])
1080
- DATA_CLUSTERING_ZH = DATA_CLUSTERING_ZH[DATA_CLUSTERING_ZH.iloc[:, 2:].ne("").any(axis=1)]
1081
-
1082
- DATA_PAIR_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_PAIR_CLASSIFICATION_ZH])
1083
- DATA_PAIR_CLASSIFICATION_ZH = DATA_PAIR_CLASSIFICATION_ZH[DATA_PAIR_CLASSIFICATION_ZH.iloc[:, 2:].ne("").any(axis=1)]
1084
-
1085
- DATA_RERANKING_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_RERANKING_ZH])
1086
- DATA_RERANKING_ZH = DATA_RERANKING_ZH[DATA_RERANKING_ZH.iloc[:, 2:].ne("").any(axis=1)]
1087
-
1088
- DATA_RETRIEVAL_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_RETRIEVAL_ZH])
1089
- DATA_RETRIEVAL_ZH = DATA_RETRIEVAL_ZH[DATA_RETRIEVAL_ZH.iloc[:, 2:].ne("").any(axis=1)]
1090
-
1091
- DATA_STS_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_STS_ZH])
1092
- DATA_STS_ZH = DATA_STS_ZH[DATA_STS_ZH.iloc[:, 2:].ne("").any(axis=1)]
1093
-
1094
- # Fill NaN after averaging
1095
- DATA_OVERALL_ZH.fillna("", inplace=True)
1096
-
1097
- DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
1098
- DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]
1099
-
1100
- return DATA_OVERALL_ZH
1101
-
1102
- def get_mteb_average_pl():
1103
- global DATA_OVERALL_PL, DATA_CLASSIFICATION_PL, DATA_CLUSTERING_PL, DATA_PAIR_CLASSIFICATION_PL, DATA_RETRIEVAL_PL, DATA_STS_PL
1104
- DATA_OVERALL_PL = get_mteb_data(
1105
- tasks=[
1106
- "Classification",
1107
- "Clustering",
1108
- "PairClassification",
1109
- "Retrieval",
1110
- "STS",
1111
- ],
1112
- datasets=TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL,
1113
- fillna=False,
1114
- add_emb_dim=True,
1115
- rank=False,
1116
- )
1117
- # Debugging:
1118
- # DATA_OVERALL_PL.to_csv("overall.csv")
1119
-
1120
- DATA_OVERALL_PL.insert(1, f"Average ({len(TASK_LIST_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PL].mean(axis=1, skipna=False))
1121
- DATA_OVERALL_PL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLASSIFICATION_PL].mean(axis=1, skipna=False))
1122
- DATA_OVERALL_PL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLUSTERING_PL].mean(axis=1, skipna=False))
1123
- DATA_OVERALL_PL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PAIR_CLASSIFICATION_PL].mean(axis=1, skipna=False))
1124
- DATA_OVERALL_PL.insert(5, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_RETRIEVAL_PL].mean(axis=1, skipna=False))
1125
- DATA_OVERALL_PL.insert(6, f"STS Average ({len(TASK_LIST_STS_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_STS_PL].mean(axis=1, skipna=False))
1126
- DATA_OVERALL_PL.sort_values(f"Average ({len(TASK_LIST_PL)} datasets)", ascending=False, inplace=True)
1127
- # Start ranking from 1
1128
- DATA_OVERALL_PL.insert(0, "Rank", list(range(1, len(DATA_OVERALL_PL) + 1)))
1129
-
1130
- DATA_OVERALL_PL = DATA_OVERALL_PL.round(2)
1131
-
1132
- DATA_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLASSIFICATION_PL])
1133
- # Only keep rows with at least one score in addition to the "Model" & rank column
1134
- DATA_CLASSIFICATION_PL = DATA_CLASSIFICATION_PL[DATA_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
1135
-
1136
- DATA_CLUSTERING_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLUSTERING_PL])
1137
- DATA_CLUSTERING_PL = DATA_CLUSTERING_PL[DATA_CLUSTERING_PL.iloc[:, 2:].ne("").any(axis=1)]
1138
-
1139
- DATA_PAIR_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION_PL])
1140
- DATA_PAIR_CLASSIFICATION_PL = DATA_PAIR_CLASSIFICATION_PL[DATA_PAIR_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
1141
-
1142
- DATA_RETRIEVAL_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_RETRIEVAL_PL])
1143
- DATA_RETRIEVAL_PL = DATA_RETRIEVAL_PL[DATA_RETRIEVAL_PL.iloc[:, 2:].ne("").any(axis=1)]
1144
-
1145
- DATA_STS_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_STS_PL])
1146
- DATA_STS_PL = DATA_STS_PL[DATA_STS_PL.iloc[:, 2:].ne("").any(axis=1)]
1147
-
1148
- # Fill NaN after averaging
1149
- DATA_OVERALL_PL.fillna("", inplace=True)
1150
-
1151
- DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
1152
- DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
1153
-
1154
- return DATA_OVERALL_PL
1155
-
1156
  get_mteb_average()
1157
- get_mteb_average_pl()
1158
- get_mteb_average_zh()
1159
- DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
1160
- DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
1161
- DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)
1162
- DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)
1163
- DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
1164
- DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
1165
- DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
1166
- DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
1167
-
1168
- # Exact, add all non-nan integer values for every dataset
1169
- NUM_SCORES = 0
1170
- DATASETS = []
1171
- MODELS = []
1172
- # LANGUAGES = []
1173
- for d in [
1174
- DATA_BITEXT_MINING,
1175
- DATA_BITEXT_MINING_OTHER,
1176
- DATA_CLASSIFICATION_EN,
1177
- DATA_CLASSIFICATION_DA,
1178
- DATA_CLASSIFICATION_NB,
1179
- DATA_CLASSIFICATION_PL,
1180
- DATA_CLASSIFICATION_SV,
1181
- DATA_CLASSIFICATION_ZH,
1182
- DATA_CLASSIFICATION_OTHER,
1183
- DATA_CLUSTERING,
1184
- DATA_CLUSTERING_DE,
1185
- DATA_CLUSTERING_PL,
1186
- DATA_CLUSTERING_ZH,
1187
- DATA_PAIR_CLASSIFICATION,
1188
- DATA_PAIR_CLASSIFICATION_PL,
1189
- DATA_PAIR_CLASSIFICATION_ZH,
1190
- DATA_RERANKING,
1191
- DATA_RERANKING_ZH,
1192
- DATA_RETRIEVAL,
1193
- DATA_RETRIEVAL_PL,
1194
- DATA_RETRIEVAL_ZH,
1195
- DATA_STS_EN,
1196
- DATA_STS_PL,
1197
- DATA_STS_ZH,
1198
- DATA_STS_OTHER,
1199
- DATA_SUMMARIZATION,
1200
- ]:
1201
- # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
1202
- cols_to_ignore = 3 if "Average" in d.columns else 2
1203
- # Count number of scores including only non-nan floats & excluding the rank column
1204
- NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
1205
- # Exclude rank & model name column (first two); Do not count different language versions as different datasets
1206
- DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
1207
- # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
1208
- MODELS += d["Model"].tolist()
1209
 
1210
  NUM_DATASETS = len(set(DATASETS))
1211
  # NUM_LANGUAGES = len(set(LANGUAGES))
1212
  NUM_MODELS = len(set(MODELS))
1213
 
1214
- # 1. Force headers to wrap
1215
- # 2. Force model column (maximum) width
1216
- # 3. Prevent model column from overflowing, scroll instead
1217
- css = """
1218
- table > thead {
1219
- white-space: normal
1220
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1221
 
1222
- table {
1223
- --cell-width-1: 210px
1224
- }
1225
 
1226
- table > tbody > tr > td:nth-child(2) > div {
1227
- overflow-x: auto
1228
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1229
  """
1230
 
1231
- block = gr.Blocks(css=css)
1232
- with block:
1233
- gr.Markdown(f"""
1234
- Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
1235
- """)
1236
- with gr.Tabs():
1237
- with gr.TabItem("Overall"):
1238
- with gr.TabItem("English"):
1239
- with gr.Row():
1240
- gr.Markdown("""
1241
- **Overall MTEB English leaderboard** 🔮
1242
-
1243
- - **Metric:** Various, refer to task tabs
1244
- - **Languages:** English
1245
- """)
1246
- with gr.Row():
1247
- data_overall = gr.components.Dataframe(
1248
- DATA_OVERALL,
1249
- datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
1250
- type="pandas",
1251
- height=600,
1252
- )
1253
- with gr.Row():
1254
- data_run_overall = gr.Button("Refresh")
1255
- data_run_overall.click(get_mteb_average, inputs=None, outputs=data_overall)
1256
- with gr.TabItem("Chinese"):
1257
- with gr.Row():
1258
- gr.Markdown("""
1259
- **Overall MTEB Chinese leaderboard (C-MTEB)** 🔮🇨🇳
1260
-
1261
- - **Metric:** Various, refer to task tabs
1262
- - **Languages:** Chinese
1263
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1264
- """)
1265
- with gr.Row():
1266
- data_overall_zh = gr.components.Dataframe(
1267
- DATA_OVERALL_ZH,
1268
- datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_ZH.columns),
1269
- type="pandas",
1270
- height=600,
1271
- )
1272
- with gr.Row():
1273
- data_run_overall_zh = gr.Button("Refresh")
1274
- data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
1275
- with gr.TabItem("Polish"):
1276
- with gr.Row():
1277
- gr.Markdown("""
1278
- **Overall MTEB Polish leaderboard (PL-MTEB)** 🔮🇵🇱
1279
-
1280
- - **Metric:** Various, refer to task tabs
1281
- - **Languages:** Polish
1282
- - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata), [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
1283
- """)
1284
- with gr.Row():
1285
- data_overall_pl = gr.components.Dataframe(
1286
- DATA_OVERALL_PL,
1287
- datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_PL.columns),
1288
- type="pandas",
1289
- height=600,
1290
- )
1291
- with gr.Row():
1292
- data_run_overall_pl = gr.Button("Refresh")
1293
- data_run_overall_pl.click(get_mteb_average_pl, inputs=None, outputs=data_overall_pl)
1294
- with gr.TabItem("Bitext Mining"):
1295
- with gr.TabItem("English-X"):
1296
- with gr.Row():
1297
- gr.Markdown("""
1298
- **Bitext Mining English-X Leaderboard** 🎌
1299
-
1300
- - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
1301
- - **Languages:** 117 (Pairs of: English & other language)
1302
- """)
1303
- with gr.Row():
1304
- data_bitext_mining = gr.components.Dataframe(
1305
- DATA_BITEXT_MINING,
1306
- datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
1307
- type="pandas",
1308
- )
1309
- with gr.Row():
1310
- data_run_bitext_mining = gr.Button("Refresh")
1311
- data_run_bitext_mining.click(
1312
- partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING),
1313
- outputs=data_bitext_mining,
1314
- )
1315
- with gr.TabItem("Danish"):
1316
- with gr.Row():
1317
- gr.Markdown("""
1318
- **Bitext Mining Danish Leaderboard** 🎌🇩🇰
1319
-
1320
- - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
1321
- - **Languages:** Danish & Bornholmsk (Danish Dialect)
1322
- - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1323
- """)
1324
- with gr.Row():
1325
- data_bitext_mining_da = gr.components.Dataframe(
1326
- DATA_BITEXT_MINING_OTHER,
1327
- datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
1328
- type="pandas",
1329
- )
1330
- with gr.Row():
1331
- data_run_bitext_mining_da = gr.Button("Refresh")
1332
- data_run_bitext_mining_da.click(
1333
- partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING_OTHER),
1334
- outputs=data_bitext_mining_da,
1335
- )
1336
- with gr.TabItem("Classification"):
1337
- with gr.TabItem("English"):
1338
- with gr.Row():
1339
- gr.Markdown("""
1340
- **Classification English Leaderboard** ❤️
1341
-
1342
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1343
- - **Languages:** English
1344
- """)
1345
- with gr.Row():
1346
- data_classification_en = gr.components.Dataframe(
1347
- DATA_CLASSIFICATION_EN,
1348
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
1349
- type="pandas",
1350
- )
1351
- with gr.Row():
1352
- data_run_classification_en = gr.Button("Refresh")
1353
- data_run_classification_en.click(
1354
- partial(get_mteb_data, tasks=["Classification"], langs=["en"]),
1355
- outputs=data_classification_en,
1356
- )
1357
- with gr.TabItem("Chinese"):
1358
- with gr.Row():
1359
- gr.Markdown("""
1360
- **Classification Chinese Leaderboard** 🧡🇨🇳
1361
-
1362
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1363
- - **Languages:** Chinese
1364
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1365
- """)
1366
- with gr.Row():
1367
- data_classification_zh = gr.components.Dataframe(
1368
- DATA_CLASSIFICATION_ZH,
1369
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_ZH.columns),
1370
- type="pandas",
1371
- )
1372
- with gr.Row():
1373
- data_run_classification_zh = gr.Button("Refresh")
1374
- data_run_classification_zh.click(
1375
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_ZH),
1376
- outputs=data_classification_zh,
1377
- )
1378
- with gr.TabItem("Danish"):
1379
- with gr.Row():
1380
- gr.Markdown("""
1381
- **Classification Danish Leaderboard** 🤍🇩🇰
1382
-
1383
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1384
- - **Languages:** Danish
1385
- - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1386
- """)
1387
- with gr.Row():
1388
- data_classification_da = gr.components.Dataframe(
1389
- DATA_CLASSIFICATION_DA,
1390
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_DA.columns),
1391
- type="pandas",
1392
- )
1393
- with gr.Row():
1394
- data_run_classification_da = gr.Button("Refresh")
1395
- data_run_classification_da.click(
1396
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_DA),
1397
- outputs=data_run_classification_da,
1398
- )
1399
- with gr.TabItem("Norwegian"):
1400
- with gr.Row():
1401
- gr.Markdown("""
1402
- **Classification Norwegian Leaderboard** 💙🇳🇴
1403
-
1404
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1405
- - **Languages:** Norwegian Bokmål
1406
- - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1407
- """)
1408
- with gr.Row():
1409
- data_classification_nb = gr.components.Dataframe(
1410
- DATA_CLASSIFICATION_NB,
1411
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_NB.columns),
1412
- type="pandas",
1413
- )
1414
- with gr.Row():
1415
- data_run_classification_nb = gr.Button("Refresh")
1416
- data_run_classification_nb.click(
1417
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_NB),
1418
- outputs=data_classification_nb,
1419
- )
1420
- with gr.TabItem("Polish"):
1421
- with gr.Row():
1422
- gr.Markdown("""
1423
- **Classification Polish Leaderboard** 🤍🇵🇱
1424
-
1425
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1426
- - **Languages:** Polish
1427
- - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1428
- """)
1429
- with gr.Row():
1430
- data_classification_pl = gr.components.Dataframe(
1431
- DATA_CLASSIFICATION_PL,
1432
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_PL.columns),
1433
- type="pandas",
1434
- )
1435
- with gr.Row():
1436
- data_run_classification_pl = gr.Button("Refresh")
1437
- data_run_classification_pl.click(
1438
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_PL),
1439
- outputs=data_classification_pl,
1440
- )
1441
- with gr.TabItem("Swedish"):
1442
- with gr.Row():
1443
- gr.Markdown("""
1444
- **Classification Swedish Leaderboard** 💛🇸🇪
1445
-
1446
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1447
- - **Languages:** Swedish
1448
- - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1449
- """)
1450
- with gr.Row():
1451
- data_classification_sv = gr.components.Dataframe(
1452
- DATA_CLASSIFICATION_SV,
1453
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_SV.columns),
1454
- type="pandas",
1455
- )
1456
- with gr.Row():
1457
- data_run_classification_sv = gr.Button("Refresh")
1458
- data_run_classification_sv.click(
1459
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_SV),
1460
- outputs=data_classification_sv,
1461
- )
1462
- with gr.TabItem("Other"):
1463
- with gr.Row():
1464
- gr.Markdown("""
1465
- **Classification Other Languages Leaderboard** 💜💚💙
1466
-
1467
- - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1468
- - **Languages:** 47 (Only languages not included in the other tabs)
1469
- """)
1470
- with gr.Row():
1471
- data_classification = gr.components.Dataframe(
1472
- DATA_CLASSIFICATION_OTHER,
1473
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_OTHER) * 10,
1474
- type="pandas",
1475
- )
1476
- with gr.Row():
1477
- data_run_classification = gr.Button("Refresh")
1478
- data_run_classification.click(
1479
- partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_OTHER),
1480
- outputs=data_classification,
1481
- )
1482
- with gr.TabItem("Clustering"):
1483
- with gr.TabItem("English"):
1484
- with gr.Row():
1485
- gr.Markdown("""
1486
- **Clustering Leaderboard** ✨
1487
-
1488
- - **Metric:** Validity Measure (v_measure)
1489
- - **Languages:** English
1490
- """)
1491
- with gr.Row():
1492
- data_clustering = gr.components.Dataframe(
1493
- DATA_CLUSTERING,
1494
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
1495
- type="pandas",
1496
- )
1497
- with gr.Row():
1498
- data_run_clustering_en = gr.Button("Refresh")
1499
- data_run_clustering_en.click(
1500
- partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING),
1501
- outputs=data_clustering,
1502
- )
1503
- with gr.TabItem("Chinese"):
1504
- with gr.Row():
1505
- gr.Markdown("""
1506
- **Clustering Chinese Leaderboard** ✨🇨🇳
1507
-
1508
- - **Metric:** Validity Measure (v_measure)
1509
- - **Languages:** Chinese
1510
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1511
- """)
1512
- with gr.Row():
1513
- data_clustering_zh = gr.components.Dataframe(
1514
- DATA_CLUSTERING_ZH,
1515
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_ZH.columns),
1516
- type="pandas",
1517
- )
1518
- with gr.Row():
1519
- data_run_clustering_zh = gr.Button("Refresh")
1520
- data_run_clustering_zh.click(
1521
- partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_ZH),
1522
- outputs=data_clustering_zh,
1523
- )
1524
- with gr.TabItem("German"):
1525
- with gr.Row():
1526
- gr.Markdown("""
1527
- **Clustering German Leaderboard** ✨🇩🇪
1528
-
1529
- - **Metric:** Validity Measure (v_measure)
1530
- - **Languages:** German
1531
- - **Credits:** [Silvan](https://github.com/slvnwhrl)
1532
- """)
1533
- with gr.Row():
1534
- data_clustering_de = gr.components.Dataframe(
1535
- DATA_CLUSTERING_DE,
1536
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_DE.columns) * 2,
1537
- type="pandas",
1538
- )
1539
- with gr.Row():
1540
- data_run_clustering_de = gr.Button("Refresh")
1541
- data_run_clustering_de.click(
1542
- partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_DE),
1543
- outputs=data_clustering_de,
1544
- )
1545
- with gr.TabItem("Polish"):
1546
- with gr.Row():
1547
- gr.Markdown("""
1548
- **Clustering Polish Leaderboard** ✨🇵🇱
1549
-
1550
- - **Metric:** Validity Measure (v_measure)
1551
- - **Languages:** Polish
1552
- - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1553
- """)
1554
- with gr.Row():
1555
- data_clustering_pl = gr.components.Dataframe(
1556
- DATA_CLUSTERING_PL,
1557
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_PL.columns) * 2,
1558
- type="pandas",
1559
- )
1560
- with gr.Row():
1561
- data_run_clustering_pl = gr.Button("Refresh")
1562
- data_run_clustering_pl.click(
1563
- partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_PL),
1564
- outputs=data_clustering_pl,
1565
- )
1566
- with gr.TabItem("Pair Classification"):
1567
- with gr.TabItem("English"):
1568
- with gr.Row():
1569
- gr.Markdown("""
1570
- **Pair Classification English Leaderboard** 🎭
1571
-
1572
- - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1573
- - **Languages:** English
1574
- """)
1575
- with gr.Row():
1576
- data_pair_classification = gr.components.Dataframe(
1577
- DATA_PAIR_CLASSIFICATION,
1578
- datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
1579
- type="pandas",
1580
- )
1581
- with gr.Row():
1582
- data_run_pair_classification = gr.Button("Refresh")
1583
- data_run_pair_classification.click(
1584
- partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION),
1585
- outputs=data_pair_classification,
1586
- )
1587
- with gr.TabItem("Chinese"):
1588
- with gr.Row():
1589
- gr.Markdown("""
1590
- **Pair Classification Chinese Leaderboard** 🎭🇨🇳
1591
-
1592
- - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1593
- - **Languages:** Chinese
1594
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1595
- """)
1596
- with gr.Row():
1597
- data_pair_classification_zh = gr.components.Dataframe(
1598
- DATA_PAIR_CLASSIFICATION_ZH,
1599
- datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_ZH.columns),
1600
- type="pandas",
1601
- )
1602
- with gr.Row():
1603
- data_run_pair_classification_zh = gr.Button("Refresh")
1604
- data_run_pair_classification_zh.click(
1605
- partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_ZH),
1606
- outputs=data_pair_classification_zh,
1607
- )
1608
- with gr.TabItem("Polish"):
1609
- with gr.Row():
1610
- gr.Markdown("""
1611
- **Pair Classification Polish Leaderboard** 🎭🇵🇱
1612
-
1613
- - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1614
- - **Languages:** Polish
1615
- - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1616
- """)
1617
- with gr.Row():
1618
- data_pair_classification_pl = gr.components.Dataframe(
1619
- DATA_PAIR_CLASSIFICATION_PL,
1620
- datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_PL.columns),
1621
- type="pandas",
1622
- )
1623
- with gr.Row():
1624
- data_run_pair_classification_pl = gr.Button("Refresh")
1625
- data_run_pair_classification_pl.click(
1626
- partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_PL),
1627
- outputs=data_pair_classification_pl,
1628
- )
1629
- with gr.TabItem("Reranking"):
1630
- with gr.TabItem("English"):
1631
- with gr.Row():
1632
- gr.Markdown("""
1633
- **Reranking English Leaderboard** 🥈
1634
-
1635
- - **Metric:** Mean Average Precision (MAP)
1636
- - **Languages:** English
1637
- """)
1638
- with gr.Row():
1639
- data_reranking = gr.components.Dataframe(
1640
- DATA_RERANKING,
1641
- datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
1642
- type="pandas",
1643
- )
1644
- with gr.Row():
1645
- data_run_reranking = gr.Button("Refresh")
1646
- data_run_reranking.click(
1647
- partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING),
1648
- outputs=data_reranking,
1649
- )
1650
- with gr.TabItem("Chinese"):
1651
- with gr.Row():
1652
- gr.Markdown("""
1653
- **Reranking Chinese Leaderboard** 🥈🇨🇳
1654
-
1655
- - **Metric:** Mean Average Precision (MAP)
1656
- - **Languages:** Chinese
1657
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1658
- """)
1659
- with gr.Row():
1660
- data_reranking_zh = gr.components.Dataframe(
1661
- DATA_RERANKING_ZH,
1662
- datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING_ZH.columns),
1663
- type="pandas",
1664
- )
1665
- with gr.Row():
1666
- data_run_reranking_zh = gr.Button("Refresh")
1667
- data_run_reranking_zh.click(
1668
- partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING_ZH),
1669
- outputs=data_reranking_zh,
1670
- )
1671
- with gr.TabItem("Retrieval"):
1672
- with gr.TabItem("English"):
1673
- with gr.Row():
1674
- gr.Markdown("""
1675
- **Retrieval English Leaderboard** 🔎
1676
-
1677
- - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1678
- - **Languages:** English
1679
- """)
1680
- with gr.Row():
1681
- data_retrieval = gr.components.Dataframe(
1682
- DATA_RETRIEVAL,
1683
- # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1684
- datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
1685
- type="pandas",
1686
- )
1687
- with gr.Row():
1688
- data_run_retrieval = gr.Button("Refresh")
1689
- data_run_retrieval.click(
1690
- partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL),
1691
- outputs=data_retrieval,
1692
- )
1693
- with gr.TabItem("Chinese"):
1694
- with gr.Row():
1695
- gr.Markdown("""
1696
- **Retrieval Chinese Leaderboard** 🔎🇨🇳
1697
-
1698
- - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1699
- - **Languages:** Chinese
1700
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1701
- """)
1702
- with gr.Row():
1703
- data_retrieval_zh = gr.components.Dataframe(
1704
- DATA_RETRIEVAL_ZH,
1705
- # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1706
- datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL_ZH.columns) * 2,
1707
- type="pandas",
1708
- )
1709
- with gr.Row():
1710
- data_run_retrieval_zh = gr.Button("Refresh")
1711
- data_run_retrieval_zh.click(
1712
- partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_ZH),
1713
- outputs=data_retrieval_zh,
1714
- )
1715
- with gr.TabItem("Polish"):
1716
- with gr.Row():
1717
- gr.Markdown("""
1718
- **Retrieval Polish Leaderboard** 🔎🇵🇱
1719
-
1720
- - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1721
- - **Languages:** Polish
1722
- - **Credits:** [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
1723
- """)
1724
- with gr.Row():
1725
- data_retrieval_pl = gr.components.Dataframe(
1726
- DATA_RETRIEVAL_PL,
1727
- # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1728
- datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL_PL.columns) * 2,
1729
- type="pandas",
1730
- )
1731
- with gr.Row():
1732
- data_run_retrieval_pl = gr.Button("Refresh")
1733
- data_run_retrieval_pl.click(
1734
- partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_PL),
1735
- outputs=data_retrieval_pl,
1736
- )
1737
- with gr.TabItem("STS"):
1738
- with gr.TabItem("English"):
1739
- with gr.Row():
1740
- gr.Markdown("""
1741
- **STS English Leaderboard** 🤖
1742
-
1743
- - **Metric:** Spearman correlation based on cosine similarity
1744
- - **Languages:** English
1745
- """)
1746
- with gr.Row():
1747
- data_sts_en = gr.components.Dataframe(
1748
- DATA_STS_EN,
1749
- datatype=["number", "markdown"] + ["number"] * len(DATA_STS_EN.columns),
1750
- type="pandas",
1751
- )
1752
- with gr.Row():
1753
- data_run_sts_en = gr.Button("Refresh")
1754
- data_run_sts_en.click(
1755
- partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS),
1756
- outputs=data_sts_en,
1757
- )
1758
- with gr.TabItem("Chinese"):
1759
- with gr.Row():
1760
- gr.Markdown("""
1761
- **STS Chinese Leaderboard** 🤖🇨🇳
1762
-
1763
- - **Metric:** Spearman correlation based on cosine similarity
1764
- - **Languages:** Chinese
1765
- - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1766
- """)
1767
- with gr.Row():
1768
- data_sts_zh = gr.components.Dataframe(
1769
- DATA_STS_ZH,
1770
- datatype=["number", "markdown"] + ["number"] * len(DATA_STS_ZH.columns),
1771
- type="pandas",
1772
- )
1773
- with gr.Row():
1774
- data_run_sts_zh = gr.Button("Refresh")
1775
- data_run_sts_zh.click(
1776
- partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_ZH),
1777
- outputs=data_sts_zh,
1778
- )
1779
- with gr.TabItem("Polish"):
1780
- with gr.Row():
1781
- gr.Markdown("""
1782
- **STS Polish Leaderboard** 🤖🇵🇱
1783
-
1784
- - **Metric:** Spearman correlation based on cosine similarity
1785
- - **Languages:** Polish
1786
- - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1787
- """)
1788
- with gr.Row():
1789
- data_sts_pl = gr.components.Dataframe(
1790
- DATA_STS_PL,
1791
- datatype=["number", "markdown"] + ["number"] * len(DATA_STS_PL.columns),
1792
- type="pandas",
1793
- )
1794
- with gr.Row():
1795
- data_run_sts_pl = gr.Button("Refresh")
1796
- data_run_sts_pl.click(
1797
- partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_PL),
1798
- outputs=data_sts_pl,
1799
- )
1800
- with gr.TabItem("Other"):
1801
- with gr.Row():
1802
- gr.Markdown("""
1803
- **STS Other Leaderboard** 👽
1804
-
1805
- - **Metric:** Spearman correlation based on cosine similarity
1806
- - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)
1807
- """)
1808
- with gr.Row():
1809
- data_sts_other = gr.components.Dataframe(
1810
- DATA_STS_OTHER,
1811
- datatype=["number", "markdown"] + ["number"] * len(DATA_STS_OTHER.columns) * 2,
1812
- type="pandas",
1813
- )
1814
- with gr.Row():
1815
- data_run_sts_other = gr.Button("Refresh")
1816
- data_run_sts_other.click(
1817
- partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_OTHER),
1818
- outputs=data_sts_other,
1819
- )
1820
- with gr.TabItem("Summarization"):
1821
- with gr.Row():
1822
- gr.Markdown("""
1823
- **Summarization Leaderboard** 📜
1824
-
1825
- - **Metric:** Spearman correlation based on cosine similarity
1826
- - **Languages:** English
1827
- """)
1828
- with gr.Row():
1829
- data_summarization = gr.components.Dataframe(
1830
- DATA_SUMMARIZATION,
1831
- datatype=["number", "markdown"] + ["number"] * 2,
1832
- type="pandas",
1833
- )
1834
- with gr.Row():
1835
- data_run = gr.Button("Refresh")
1836
- data_run.click(
1837
- partial(get_mteb_data, tasks=["Summarization"]),
1838
- outputs=data_summarization,
1839
- )
1840
- gr.Markdown(f"""
1841
- - **Total Datasets**: {NUM_DATASETS}
1842
- - **Total Languages**: 113
1843
- - **Total Scores**: {NUM_SCORES}
1844
- - **Total Models**: {NUM_MODELS}
1845
- """ + r"""
1846
- Made with ❤️ for NLP. If this work is useful to you, please consider citing:
1847
-
1848
- ```bibtex
1849
- @article{muennighoff2022mteb,
1850
- doi = {10.48550/ARXIV.2210.07316},
1851
- url = {https://arxiv.org/abs/2210.07316},
1852
- author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils},
1853
- title = {MTEB: Massive Text Embedding Benchmark},
1854
- publisher = {arXiv},
1855
- journal={arXiv preprint arXiv:2210.07316},
1856
- year = {2022}
1857
- }
1858
- ```
1859
- """)
1860
- # Running the functions on page load in addition to when the button is clicked
1861
- # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
1862
- """
1863
- block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
1864
- """
1865
-
1866
- block.queue(max_size=10)
1867
- block.launch()
1868
-
1869
-
1870
- # Possible changes:
1871
- # Could add graphs / other visual content
1872
- # Could add verification marks
1873
-
1874
- # Sources:
1875
- # https://huggingface.co/spaces/gradio/leaderboard
1876
- # https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
1877
- # https://getemoji.com/
 
929
  meta = metadata_load(readme_path)
930
  if "model-index" not in meta:
931
  continue
932
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933
  if len(datasets) > 0:
934
  task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
935
  elif langs:
 
963
  return df
964
 
965
  def get_mteb_average():
966
+ global DATA_OVERALL
967
  DATA_OVERALL = get_mteb_data(
968
  tasks=[
969
  "Classification",
 
996
 
997
  DATA_OVERALL = DATA_OVERALL.round(2)
998
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  # Fill NaN after averaging
1000
  DATA_OVERALL.fillna("", inplace=True)
1001
 
 
1004
 
1005
  return DATA_OVERALL
1006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007
# Build DATA_OVERALL (and the module-level per-task frames) once at startup.
get_mteb_average()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
 
1009
# Footer statistics: number of distinct datasets / models gathered while
# loading the leaderboard results.
NUM_DATASETS = len(frozenset(DATASETS))
# NUM_LANGUAGES = len(frozenset(LANGUAGES))
NUM_MODELS = len(frozenset(MODELS))
1012
 
1013
# Hidden Dataframe component holding the overall leaderboard. It is never
# displayed (visible=False); it only defines the output schema of the
# /predict API endpoint served by the Interface below.
data_overall = gr.components.Dataframe(
    DATA_OVERALL,
    # Fixed: the missing trailing comma here was a SyntaxError in the original.
    headers=list(DATA_OVERALL.columns),
    # Fixed: `datatype` expects per-column type names, but the original passed
    # list(DATA_OVERALL.values), i.e. the data rows themselves. Use the same
    # convention as the other leaderboard tables in this file
    # (rank + markdown model link, then numeric score columns).
    datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
    visible=False,
    line_breaks=False,
    interactive=False,
)
1021
+
1022
import unicodedata


def is_valid_unicode(char):
    """Return True when *char* is a character with an assigned Unicode name."""
    try:
        unicodedata.name(char)
    except ValueError:
        # Unassigned code points, most control characters, and lone
        # surrogates have no Unicode name and are treated as invalid.
        return False
    return True


def remove_invalid_unicode(input_string):
    """Drop characters without an assigned Unicode name from *input_string*.

    Non-string values are returned unchanged so this can be applied blindly
    over mixed-type DataFrame columns.
    """
    if not isinstance(input_string, str):
        return input_string
    return "".join(ch for ch in input_string if is_valid_unicode(ch))
1037
 
1038
def display(x, y):
    """Gradio endpoint: return the sanitized overall leaderboard table.

    Both parameters only satisfy the Interface input signature (the Markdown
    intro component and a hidden textbox) and are ignored.
    """
    global data_overall  # module-level table defined above

    # NOTE(review): `data_overall` is the gr.components.Dataframe component
    # built above, not a pandas DataFrame, so this guard normally fails and
    # the fallback branch below is taken — confirm which object was intended.
    if isinstance(data_overall, pd.DataFrame):
        # Scrub string (object-dtype) columns of characters that have no
        # Unicode name; they can break serialization of the API response.
        for column in data_overall.columns:
            if data_overall[column].dtype == "object":
                data_overall[column] = data_overall[column].apply(remove_invalid_unicode)
        # NOTE(review): COLS is not defined anywhere in this chunk — verify it
        # exists at module level, otherwise this line raises NameError.
        return data_overall[COLS]

    # Fixed: the original printed a message and implicitly returned None,
    # leaving the /predict API with an empty response. Fall back to the
    # module-level DataFrame so callers always get the table.
    print("leaderboard_table is not a DataFrame.")
    return DATA_OVERALL
1052
+
1053
# Hidden placeholder input required by the two-input Interface signature.
dummy1 = gr.Textbox(visible=False)

# User-facing usage notes rendered as the first "input" of the Interface.
# (Typos in the displayed text fixed: "provides" -> "provide",
# "response it's" -> "response is", "commenst" -> "comments".)
INTRODUCTION_TEXT = """
This is a copied space from LLM Trustworthy Leaderboard. Instead of displaying
the results as a table, this space was modified to simply provide a gradio API interface.
Using the following python script below, users can access the full leaderboard data easily.
Python on how to access the data:
```python
# Import dependencies
from gradio_client import Client
# Initialize the Gradio client with the API URL
client = Client("https://rodrigomasini-data-only-llm-perf-leaderboard.hf.space/")
try:
    # Perform the API call
    response = client.predict("","", api_name='/predict')
    # Check if response is directly accessible
    if len(response) > 0:
        print("Response received!")
        headers = response.get('headers', [])
        data = response.get('data', [])
        print(headers)
        # Remove comments if you want to download the dataset and save in csv format
        # Specify the path to your CSV file
        #csv_file_path = 'llm-perf-benchmark.csv'
        # Open the CSV file for writing
        #with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        #    writer = csv.writer(file)
        #    # Write the headers
        #    writer.writerow(headers)
        #    # Write the data
        #    for row in data:
        #        writer.writerow(row)
        #print(f"Results saved to {csv_file_path}")
    # If the above line prints a string that looks like JSON, you can parse it with json.loads(response)
    # Otherwise, you might need to adjust based on the actual structure of `response`
except Exception as e:
    print(f"An error occurred: {e}")
```
"""

# Minimal API-only app: the Markdown intro and hidden textbox feed `display`,
# which returns the leaderboard DataFrame through the hidden table component.
interface = gr.Interface(
    fn=display,
    inputs=[gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text"), dummy1],
    outputs=[data_overall],
)

interface.launch()