Muennighoff committed
Commit dbfa15a • 1 Parent(s): 3ffdc42

Add emojis

Files changed (1)
  1. app.py +87 -29
app.py CHANGED
@@ -3,8 +3,6 @@ import pandas as pd
  from huggingface_hub import HfApi, hf_hub_download
  from huggingface_hub.repocard import metadata_load
 
- path = f"https://huggingface.co/api/spaces"
-
  TASKS = [
  "BitextMining",
  "Classification",
@@ -185,15 +183,15 @@ def get_mteb_average(get_all_avgs=False):
  cast_to_str=False
  )
 
- DATA_OVERALL.insert(1, "Average", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(2, "Classification Average", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(3, "Clustering Average", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(4, "Pair Classification Average", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(5, "Reranking Average", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(6, "Retrieval Average", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(7, "STS Average", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
- DATA_OVERALL.insert(8, "Summarization Average", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
- DATA_OVERALL.sort_values("Average", ascending=False, inplace=True)
+ DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
+ DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
+ DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)
  # Start ranking from 1
  DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
 
@@ -207,7 +205,7 @@ def get_mteb_average(get_all_avgs=False):
  DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
  DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
- DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Average", "Classification Average", "Clustering Average", "Pair Classification Average", "Reranking Average", "Retrieval Average", "STS Average", "Summarization Average"]]
+ DATA_OVERALL = DATA_OVERALL[["Rank", "Model", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
  return DATA_OVERALL
 
@@ -216,19 +214,27 @@ block = gr.Blocks()
 
 
  with block:
- gr.Markdown(
- """MTEB Leaderboard. See <a href="https://huggingface.co/Gradio-Blocks" target="_blank" style="text-decoration: underline">Blocks Party Event</a>"""
- )
+ gr.Markdown(f"""
+ Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
+
+ - **Total Scores**: TODO
+ - **Total Models**: {len(DATA_OVERALL)}
+ - **Total Users**: TODO
+ """)
  with gr.Tabs():
  with gr.TabItem("Overall"):
  with gr.Row():
- gr.Markdown("""Average Scores""")
+ gr.Markdown("""
+ **Overall MTEB English leaderboard 🔮**
+
+ - **Metric:** Various, refer to task tabs
+ - **Languages:** English, refer to task tabs for others
+ """)
  with gr.Row():
  data_overall = gr.components.Dataframe(
  DATA_OVERALL,
  datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
  type="pandas",
- #col_count=(len(DATA_OVERALL.columns), "fixed"),
  wrap=True,
  )
  with gr.Row():
@@ -236,7 +242,12 @@ with block:
  data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
  with gr.TabItem("BitextMining"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Clustering""")
+ gr.Markdown("""
+ **Bitext Mining Leaderboard 🎌**
+
+ - **Metric:** Accuracy (accuracy)
+ - **Languages:** 117
+ """)
  with gr.Row():
  data_bitext_mining = gr.components.Dataframe(
  datatype=["markdown"] * 500, # hack when we don't know how many columns
@@ -253,7 +264,12 @@ with block:
  with gr.TabItem("Classification"):
  with gr.TabItem("English"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Classification""")
+ gr.Markdown("""
+ **Classification Leaderboard ❤️**
+
+ - **Metric:** Accuracy (accuracy)
+ - **Languages:** English
+ """)
  with gr.Row():
  data_classification_en = gr.components.Dataframe(
  DATA_CLASSIFICATION_EN,
@@ -274,7 +290,12 @@ with block:
  )
  with gr.TabItem("Multilingual"):
  with gr.Row():
- gr.Markdown("""Multilingual Classification""")
+ gr.Markdown("""
+ **Classification Multilingual Leaderboard 💜💚💙**
+
+ - **Metric:** Accuracy (accuracy)
+ - **Languages:** 51
+ """)
  with gr.Row():
  data_classification = gr.components.Dataframe(
  datatype=["markdown"] * 500, # hack when we don't know how many columns
@@ -290,7 +311,12 @@ with block:
  )
  with gr.TabItem("Clustering"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Clustering""")
+ gr.Markdown("""
+ **Clustering Leaderboard ✨**
+
+ - **Metric:** Validity Measure (v_measure)
+ - **Languages:** English
+ """)
  with gr.Row():
  data_clustering = gr.components.Dataframe(
  DATA_CLUSTERING,
@@ -308,7 +334,12 @@ with block:
  )
  with gr.TabItem("Pair Classification"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Pair Classification""")
+ gr.Markdown("""
+ **Pair Classification Leaderboard 🎭**
+
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
+ - **Languages:** English
+ """)
  with gr.Row():
  data_pair_classification = gr.components.Dataframe(
  DATA_PAIR_CLASSIFICATION,
@@ -318,7 +349,7 @@ with block:
  )
  with gr.Row():
  data_run = gr.Button("Refresh")
- task_pair_classification = gr.Variable(value="Clustering")
+ task_pair_classification = gr.Variable(value="PairClassification")
  data_run.click(
  get_mteb_data,
  inputs=[task_pair_classification],
@@ -326,7 +357,12 @@ with block:
  )
  with gr.TabItem("Retrieval"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Retrieval""")
+ gr.Markdown("""
+ **Retrieval Leaderboard 🔎**
+
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
+ - **Languages:** English
+ """)
  with gr.Row():
  data_retrieval = gr.components.Dataframe(
  DATA_RETRIEVAL,
@@ -341,7 +377,12 @@ with block:
  )
  with gr.TabItem("Reranking"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Reranking""")
+ gr.Markdown("""
+ **Reranking Leaderboard 🥇**
+
+ - **Metric:** Mean Average Precision (MAP)
+ - **Languages:** English
+ """)
  with gr.Row():
  data_reranking = gr.components.Dataframe(
  DATA_RERANKING,
@@ -359,7 +400,12 @@ with block:
  with gr.TabItem("STS"):
  with gr.TabItem("English"):
  with gr.Row():
- gr.Markdown("""Leaderboard for STS""")
+ gr.Markdown("""
+ **STS Leaderboard 🤖**
+
+ - **Metric:** Spearman correlation based on cosine similarity
+ - **Languages:** English
+ """)
  with gr.Row():
  data_sts_en = gr.components.Dataframe(
  DATA_STS_EN,
@@ -378,7 +424,12 @@ with block:
  )
  with gr.TabItem("Multilingual"):
  with gr.Row():
- gr.Markdown("""Leaderboard for STS""")
+ gr.Markdown("""
+ **STS Multilingual Leaderboard 👽**
+
+ - **Metric:** Spearman correlation based on cosine similarity
+ - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
+ """)
  with gr.Row():
  data_sts = gr.components.Dataframe(
  datatype=["markdown"] * 50, # hack when we don't know how many columns
@@ -390,7 +441,12 @@ with block:
  data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
  with gr.TabItem("Summarization"):
  with gr.Row():
- gr.Markdown("""Leaderboard for Summarization""")
+ gr.Markdown("""
+ **Summarization Leaderboard 📜**
+
+ - **Metric:** Spearman correlation based on cosine similarity
+ - **Languages:** English
+ """)
  with gr.Row():
  data_summarization = gr.components.Dataframe(
  DATA_SUMMARIZATION,
@@ -406,13 +462,15 @@ with block:
  inputs=[task_summarization],
  outputs=data_summarization,
  )
- # running the function on page load in addition to when the button is clicked
+ # Running the function on page load in addition to when the button is clicked
+ # This is optional - If deactivated the data created loaded at "Build time" is shown like for Overall tab
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
  block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
  block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
  block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
  block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
  block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
+ block.load(get_mteb_data, inputs=[task_sts_en], outputs=data_sts_en)
  block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
  block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
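The substantive change in `get_mteb_average` is that the per-category average columns now embed their dataset counts in the header via f-strings, so the sort key and the final column selection must use exactly the same f-string names. Below is a minimal standalone sketch of that pattern, not the Space's code: the task lists and scores are placeholders invented for illustration.

```python
# Sketch (placeholder data) of the f-string average-column pattern used in the commit.
import pandas as pd

# Placeholder task lists; the real Space defines much longer TASK_LIST_* constants.
TASK_LIST_CLASSIFICATION = ["AmazonPolarityClassification", "Banking77Classification"]
TASK_LIST_STS = ["STS12", "STSBenchmark"]
TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_STS

DATA_OVERALL = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "AmazonPolarityClassification": [70.0, 65.0],
        "Banking77Classification": [80.0, None],  # one missing score
        "STS12": [75.0, 72.0],
        "STSBenchmark": [85.0, 83.0],
    }
)

# skipna=False: a model with any missing score gets NaN for that average,
# so partially evaluated models cannot outrank fully evaluated ones.
DATA_OVERALL.insert(
    1,
    f"Average ({len(TASK_LIST_EN)} datasets)",
    DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False),
)
DATA_OVERALL.insert(
    2,
    f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)",
    DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False),
)

# Sorting must reference the same f-string header, since a plain "Average" column no longer exists.
DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))

print(DATA_OVERALL[["Rank", "Model", f"Average ({len(TASK_LIST_EN)} datasets)"]])
```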