Muennighoff committed on
Commit 4d67578 · 1 Parent(s): 803802d

Add German Clustering; remove models without a score; remove duplicates; increment the dataset count

Files changed (1): app.py (+76 −34)
app.py CHANGED
@@ -48,6 +48,13 @@ TASK_LIST_CLUSTERING = [
     "TwentyNewsgroupsClustering",
 ]
 
+TASK_LIST_CLUSTERING_DE = [
+    "BlurbsClusteringP2P",
+    "BlurbsClusteringS2S",
+    "TenKGnadClusteringP2P",
+    "TenKGnadClusteringS2S",
+]
+
 TASK_LIST_PAIR_CLASSIFICATION = [
     "SprintDuplicateQuestions",
     "TwitterSemEval2015",
@@ -117,6 +124,7 @@ TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_
 TASK_TO_METRIC = {
     "BitextMining": "f1",
     "Clustering": "v_measure",
+    "Clustering (DE)": "v_measure",
     "Classification": "accuracy",
     "PairClassification": "cos_sim_ap",
     "Reranking": "map",
@@ -255,6 +263,9 @@ MODELS_TO_SKIP = {
     "radames/e5-large", # Duplicate
     "gentlebowl/instructor-large-safetensors", # Duplicate
     "Consensus/instructor-base", # Duplicate
+    "GovCompete/instructor-xl", # Duplicate
+    "GovCompete/e5-large-v2", # Duplicate
+    "t12e/instructor-base", # Duplicate
 }
 
 
@@ -271,7 +282,7 @@ def add_task(examples):
     # Could be added to the dataset loading script instead
     if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM:
         examples["mteb_task"] = "Classification"
-    elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING:
+    elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
         examples["mteb_task"] = "Clustering"
     elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION:
         examples["mteb_task"] = "PairClassification"
@@ -288,7 +299,7 @@ def add_task(examples):
     return examples
 
 for model in EXTERNAL_MODELS:
-    ds = load_dataset("mteb/results", model, download_mode='force_redownload', verification_mode="no_checks")
+    ds = load_dataset("mteb/results", model)#, download_mode='force_redownload', verification_mode="no_checks")
     # For local debugging:
     #, download_mode='force_redownload', verification_mode="no_checks")
     ds = ds.map(add_lang)
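This change presumably lets the Space reuse the local Hugging Face cache instead of re-downloading every result file, keeping the commented arguments around for local debugging. A hedged sketch of the two modes (the config name is a placeholder; the keyword arguments are the same `datasets` options shown in the diff):

```python
from datasets import load_dataset

# Normal operation: reuse whatever is already in the local HF cache.
ds = load_dataset("mteb/results", "some-model-name")  # config name is a placeholder

# Local debugging of freshly pushed results: bypass the cache, skip checksum checks.
ds = load_dataset(
    "mteb/results",
    "some-model-name",
    download_mode="force_redownload",
    verification_mode="no_checks",
)
```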
@@ -321,14 +332,16 @@ def get_emb_dim(model):
     return dim
 
 
-def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
+def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
     models = api.list_models(filter="mteb")
     # Initialize list to models that we cannot fetch metadata from
     df_list = []
     for model in EXTERNAL_MODEL_RESULTS:
         results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
-        if langs:
+        if len(datasets) > 0:
+            res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
+        elif langs:
             # Would be cleaner to rely on an extra language column instead
             langs_format = [f"({lang})" for lang in langs]
             res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
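The new `datasets` branch filters external results by substring match on the column name while explicitly keeping the "Model" key; dataset columns can carry language suffixes such as "(de)", which a plain equality check would miss. A self-contained toy run with invented values:

```python
# Toy reproduction of the new `datasets` filter. The shape of results_list is
# assumed from the surrounding code, not taken from the real EXTERNAL_MODEL_RESULTS.
results_list = [
    {"Model": "my-model"},
    {"BlurbsClusteringS2S": 12.3},
    {"TenKGnadClusteringS2S (de)": 22.1},
    {"TwentyNewsgroupsClustering": 50.0},
]
datasets = ["BlurbsClusteringS2S", "TenKGnadClusteringS2S"]

# Keep "Model" plus any column whose name contains a requested dataset.
res = {
    k: v
    for d in results_list
    for k, v in d.items()
    if (k == "Model") or any(x in k for x in datasets)
}
assert res == {
    "Model": "my-model",
    "BlurbsClusteringS2S": 12.3,
    "TenKGnadClusteringS2S (de)": 22.1,
}
```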
@@ -359,16 +372,20 @@ def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, add_emb_dim=False
         #     ],
         # },
         # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
-        if langs:
+        if len(datasets) > 0:
+            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
+        elif langs:
             task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
         else:
             task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
         out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
         out = {k: v for d in out for k, v in d.items()}
         out["Model"] = make_clickable_model(model.modelId)
-        if add_emb_dim:
-            out["Embedding Dimensions"] = get_emb_dim(model)
-        df_list.append(out)
+        # Model & at least one result
+        if len(out) > 1:
+            if add_emb_dim:
+                out["Embedding Dimensions"] = get_emb_dim(model)
+            df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
     cols = sorted(list(df.columns))
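The new `len(out) > 1` guard is the "remove models without a score" part of the commit message: `out` always contains at least the "Model" key, so a length of one means no dataset matched and the row is dropped instead of rendering as all-NaN. A toy illustration with invented rows:

```python
# Toy illustration of the guard; values are invented.
rows = [
    {"Model": "model-with-scores", "BlurbsClusteringS2S": 12.3},
    {"Model": "model-without-scores"},
]
df_list = [out for out in rows if len(out) > 1]
assert [r["Model"] for r in df_list] == ["model-with-scores"]
```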
@@ -437,7 +454,7 @@ with block:
     gr.Markdown(f"""
     Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
 
-    - **Total Datasets**: 58
+    - **Total Datasets**: 62
     - **Total Languages**: 112
     - **Total Scores**: >{NUM_SCORES}
     - **Total Models**: {len(DATA_OVERALL)}
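The new total is exactly the old count plus the four German clustering datasets introduced above: 58 + 4 = 62.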
@@ -531,27 +548,53 @@ with block:
                 outputs=data_classification,
             )
         with gr.TabItem("Clustering"):
-            with gr.Row():
-                gr.Markdown("""
-                **Clustering Leaderboard ✨**
-
-                - **Metric:** Validity Measure (v_measure)
-                - **Languages:** English
-                """)
-            with gr.Row():
-                data_clustering = gr.components.Dataframe(
-                    DATA_CLUSTERING,
-                    datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
-                    type="pandas",
-                )
-            with gr.Row():
-                data_run = gr.Button("Refresh")
-                task_clustering = gr.Variable(value=["Clustering"])
-                data_run.click(
-                    get_mteb_data,
-                    inputs=[task_clustering],
-                    outputs=data_clustering,
-                )
+            with gr.TabItem("English"):
+                with gr.Row():
+                    gr.Markdown("""
+                    **Clustering Leaderboard ✨**
+
+                    - **Metric:** Validity Measure (v_measure)
+                    - **Languages:** English
+                    """)
+                with gr.Row():
+                    data_clustering = gr.components.Dataframe(
+                        DATA_CLUSTERING,
+                        datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
+                        type="pandas",
+                    )
+                with gr.Row():
+                    data_run = gr.Button("Refresh")
+                    task_clustering = gr.Variable(value=["Clustering"])
+                    empty = gr.Variable(value=[])
+                    datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
+                    data_run.click(
+                        get_mteb_data,
+                        inputs=[task_clustering, empty, datasets_clustering],
+                        outputs=data_clustering,
+                    )
+            with gr.TabItem("German"):
+                with gr.Row():
+                    gr.Markdown("""
+                    **Clustering Leaderboard ✨🇩🇪**
+
+                    - **Metric:** Validity Measure (v_measure)
+                    - **Languages:** German
+                    """)
+                with gr.Row():
+                    data_clustering_de = gr.components.Dataframe(
+                        datatype=["markdown"] + ["number"] * len(TASK_LIST_CLUSTERING_DE),
+                        type="pandas",
+                    )
+                with gr.Row():
+                    data_run = gr.Button("Refresh")
+                    task_clustering_de = gr.Variable(value=["Clustering"])
+                    empty_de = gr.Variable(value=[])
+                    datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
+                    data_run.click(
+                        get_mteb_data,
+                        inputs=[task_clustering_de, empty_de, datasets_clustering_de],
+                        outputs=data_clustering_de,
+                    )
         with gr.TabItem("Pair Classification"):
             with gr.Row():
                 gr.Markdown("""
@@ -681,9 +724,7 @@ with block:
             )
     gr.Markdown(r"""
 
-    Made with ❤️ for NLP
-
-    If this work is useful to you, please consider citing:
+    Made with ❤️ for NLP. If this work is useful to you, please consider citing:
 
     ```bibtex
     @article{muennighoff2022mteb,
@@ -702,7 +743,8 @@ with block:
     block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
     block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
     block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
-    block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
+    block.load(get_mteb_data, inputs=[task_clustering, empty, datasets_clustering], outputs=data_clustering)
+    block.load(get_mteb_data, inputs=[task_clustering_de, empty_de, datasets_clustering_de], outputs=data_clustering_de)
     block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
     block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
     block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)