Spaces: Running

Muennighoff committed on
Commit • f61dd83 • 1 Parent(s): 099d855
Cache everything; Add rankings everywhere; Automate num dataset/score computation
app.py CHANGED
@@ -393,6 +393,7 @@ MODELS_TO_SKIP = {
     "anttip/ct2fast-e5-small-v2-hfie",
     "newsrx/instructor-large",
     "newsrx/instructor-xl",
+    "dmlls/all-mpnet-base-v2",
 }
 
 
@@ -471,7 +472,20 @@ def get_dim_seq_size(model):
     size = round(size["metadata"]["total_size"] / 1e9, 2)
     return dim, seq, size
 
-def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
+def add_rank(df):
+    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
+    if len(cols_to_rank) == 1:
+        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
+    else:
+        df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
+        df.sort_values("Average", ascending=False, inplace=True)
+    df.insert(0, "Rank", list(range(1, len(df) + 1)))
+    df = df.round(2)
+    # Fill NaN after averaging
+    df.fillna("", inplace=True)
+    return df
+
+def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC, rank=True):
     api = HfApi()
     models = api.list_models(filter="mteb")
     # Initialize list to models that we cannot fetch metadata from
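The new add_rank helper above drives the per-tab rankings. A minimal sketch of its behavior on a toy frame — the scores below are invented, and add_rank is copied verbatim from the hunk:

import pandas as pd

def add_rank(df):
    cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    else:
        df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
        df.sort_values("Average", ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN after averaging
    df.fillna("", inplace=True)
    return df

# Toy scores (invented): model "b" leads, "c" misses one dataset.
toy = pd.DataFrame({
    "Model": ["a", "b", "c"],
    "STS12 (en)": [70.0, 80.0, 60.0],
    "STS13 (en)": [72.0, 78.0, None],
})
print(add_rank(toy))
# With more than one score column an "Average" is inserted; skipna=False
# means "c" gets NaN there (blanked by the fillna) and sorts to the bottom:
# Rank 1 = b (79.0), Rank 2 = a (71.0), Rank 3 = c (blank Average).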
@@ -532,6 +546,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     cols = sorted(list(df.columns))
     cols.insert(0, cols.pop(cols.index("Model")))
     df = df[cols]
+    if rank:
+        df = add_rank(df)
     if fillna:
         df.fillna("", inplace=True)
     return df
@@ -551,10 +567,8 @@ def get_mteb_average():
         langs=["en", "en-en"],
         fillna=False,
         add_emb_dim=True,
+        rank=False,
     )
-    # Approximation (Missing Bitext Mining & including some nans)
-    NUM_SCORES = DATA_OVERALL.shape[0] * DATA_OVERALL.shape[1]
-
     # Debugging:
     # DATA_OVERALL.to_csv("overall.csv")
 
@@ -572,32 +586,51 @@ def get_mteb_average():
 
     DATA_OVERALL = DATA_OVERALL.round(2)
 
+    DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
+    DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
+    DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
+    DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
+    DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
+    DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
+    DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
+
     # Fill NaN after averaging
     DATA_OVERALL.fillna("", inplace=True)
 
-    DATA_CLASSIFICATION_EN = DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION]
-    DATA_CLUSTERING = DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING]
-    DATA_PAIR_CLASSIFICATION = DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION]
-    DATA_RERANKING = DATA_OVERALL[["Model"] + TASK_LIST_RERANKING]
-    DATA_RETRIEVAL = DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL]
-    DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
-    DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
-
     DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL
 
 get_mteb_average()
-
-
+DATA_BITEXT_MINING = get_mteb_data(["BitextMining"])
+DATA_CLASSIFICATION = get_mteb_data(["Classification"])
+DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
+DATA_STS = get_mteb_data(["STS"])
+
+# Exact, add all non-nan integer values for every dataset
+NUM_SCORES = 0
+DATASETS = []
+# LANGUAGES = []
+for d in [DATA_BITEXT_MINING, DATA_CLASSIFICATION, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS, DATA_SUMMARIZATION]:
+    # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
+    cols_to_ignore = 3 if "Average" in d.columns else 2
+    # Count number of scores including only non-nan floats & excluding the rank column
+    NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
+    # Exclude rank & model name column (first two); Do not count different language versions as different datasets
+    DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
+    # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
+
+NUM_DATASETS = len(set(DATASETS))
+# NUM_LANGUAGES = len(set(LANGUAGES))
 
+block = gr.Blocks()
 with block:
     gr.Markdown(f"""
     Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
 
-    - **Total Datasets**:
+    - **Total Datasets**: {NUM_DATASETS}
     - **Total Languages**: 112
-    - **Total Scores**:
+    - **Total Scores**: {NUM_SCORES}
     - **Total Models**: {len(DATA_OVERALL)}
     """)
     with gr.Tabs():
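The old rows-times-columns approximation (which counted NaNs as scores) is replaced above by an exact count over the cached per-task frames. A small sketch of the counting logic, using an invented stand-in frame:

import numpy as np
import pandas as pd

# Invented stand-in for one cached tab frame: Rank, Model, then dataset columns.
d = pd.DataFrame({
    "Rank": [1, 2],
    "Model": ["a", "b"],
    "Banking77Classification (en)": [80.0, np.nan],
    "BUCC (de-en)": [75.0, 70.0],
})

# Frames that carry an "Average" column have three leading non-score columns.
cols_to_ignore = 3 if "Average" in d.columns else 2

# Exact score count: only non-NaN cells (3 here); the old shape-based
# approximation would have counted the NaN as a score too.
num_scores = d.iloc[:, cols_to_ignore:].notna().sum().sum()

# Dataset names deduplicate across language variants via the split on " ".
datasets = {c.split(" ")[0] for c in d.columns[cols_to_ignore:]}
print(num_scores, sorted(datasets))  # 3 ['BUCC', 'Banking77Classification']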
@@ -629,7 +662,8 @@ with block:
             """)
             with gr.Row():
                 data_bitext_mining = gr.components.Dataframe(
-
+                    DATA_BITEXT_MINING,
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
                     type="pandas",
                 )
             with gr.Row():
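Every datatype list in the remaining hunks gains a leading "number" because add_rank now prepends a numeric Rank column before the markdown Model column (the linked model names). The list is deliberately over-generated as a buffer; a sketch of the pattern, with an invented column set:

# Invented columns for illustration; real frames come from add_rank.
columns = ["Rank", "Model", "Average", "STS12 (en)", "STS13 (en)"]

# Rank renders as a number, Model as markdown (clickable link), the rest as
# numbers; the surplus entries are a buffer for tabs whose column count can
# grow (see the "* 2" in the Retrieval hunk below).
datatype = ["number", "markdown"] + ["number"] * len(columns)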
@@ -652,7 +686,7 @@ with block:
             with gr.Row():
                 data_classification_en = gr.components.Dataframe(
                     DATA_CLASSIFICATION_EN,
-                    datatype=["markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
                     type="pandas",
                 )
             with gr.Row():
@@ -677,7 +711,8 @@ with block:
             """)
             with gr.Row():
                 data_classification = gr.components.Dataframe(
-
+                    DATA_CLASSIFICATION,
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION) * 10,
                     type="pandas",
                 )
             with gr.Row():
@@ -700,7 +735,7 @@ with block:
             with gr.Row():
                 data_clustering = gr.components.Dataframe(
                     DATA_CLUSTERING,
-                    datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
                     type="pandas",
                 )
             with gr.Row():
@@ -724,7 +759,8 @@ with block:
             """)
             with gr.Row():
                 data_clustering_de = gr.components.Dataframe(
-
+                    DATA_CLUSTERING_GERMAN,
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_GERMAN.columns) * 2,
                     type="pandas",
                 )
             with gr.Row():
@@ -748,7 +784,7 @@ with block:
             with gr.Row():
                 data_pair_classification = gr.components.Dataframe(
                     DATA_PAIR_CLASSIFICATION,
-                    datatype=["markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
                     type="pandas",
                 )
             with gr.Row():
@@ -771,7 +807,7 @@ with block:
                 data_retrieval = gr.components.Dataframe(
                     DATA_RETRIEVAL,
                     # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
-                    datatype=["markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
                     type="pandas",
                 )
             with gr.Row():
@@ -791,7 +827,7 @@ with block:
             with gr.Row():
                 data_reranking = gr.components.Dataframe(
                     DATA_RERANKING,
-                    datatype=["markdown"] + ["number"] * len(DATA_RERANKING.columns),
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
                     type="pandas",
                 )
             with gr.Row():
@@ -813,7 +849,7 @@ with block:
             with gr.Row():
                 data_sts_en = gr.components.Dataframe(
                     DATA_STS_EN,
-                    datatype=["markdown"] + ["number"] * len(DATA_STS_EN.columns),
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_STS_EN.columns),
                     type="pandas",
                 )
             with gr.Row():
@@ -835,7 +871,8 @@ with block:
             """)
             with gr.Row():
                 data_sts = gr.components.Dataframe(
-
+                    DATA_STS,
+                    datatype=["number", "markdown"] + ["number"] * len(DATA_STS.columns) * 2,
                     type="pandas",
                 )
             with gr.Row():
@@ -853,7 +890,7 @@ with block:
             with gr.Row():
                 data_summarization = gr.components.Dataframe(
                     DATA_SUMMARIZATION,
-                    datatype=["markdown"] + ["number"] * 2,
+                    datatype=["number", "markdown"] + ["number"] * 2,
                     type="pandas",
                 )
             with gr.Row():
@@ -880,8 +917,9 @@ with block:
     }
     ```
     """)
-    # Running the functions on page load in addition to when the button is clicked
-    # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
+    # Running the functions on page load in addition to when the button is clicked
+    # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
+    """
     block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
     block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
     block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
@@ -893,6 +931,7 @@ with block:
     block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
     block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
     block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
+    """
 
 block.queue(concurrency_count=40, max_size=10)
 block.launch()
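The last two hunks implement the "Cache everything" part of the commit: the block.load calls are fenced inside a bare triple-quoted string, so each tab now serves the frames computed once at build time instead of re-fetching from the Hub on every page load. A bare string literal is parsed but discarded at runtime, e.g.:

# A bare triple-quoted string is evaluated and thrown away, so the statements
# inside it never run — a quick toggle that keeps the code in place.
"""
print("never runs")
"""
print("runs")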