leaderboard

Sleeping

orionweller committed on May 8, 2024

Commit

05875e9

1 Parent(s): 1b70983

add_followir_tab (#102)

- add instruction following (98e437fad08d4528d401497d70f52921adaec46a)
- update (c348ee586c774fc357278f25fc1e0a099acc687f)
- merge in main (9fc87322eb9d1862ea02ec4fb63ee99dcce81533)
- minor cleanup (2ba40c7d9fb787ca0017e32a6c68a32c77df0221)
- add bi-encoder button (77cc9e7a65257c5af5784bb60a3dac2073e7fe05)

Files changed (4) hide show

EXTERNAL_MODEL_RESULTS.json +0 -0
app.py +39 -7
config.yaml +25 -0
model_meta.yaml +134 -0

EXTERNAL_MODEL_RESULTS.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

app.py CHANGED Viewed

@@ -17,6 +17,11 @@ TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
 BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]
 TASKS = list(TASKS_CONFIG.keys())
 TASK_TO_METRIC = {k:v["metric"] for k,v in TASKS_CONFIG.items()}
@@ -34,18 +39,30 @@ EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k,v in MODEL_META["model_meta"].items()
 EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k,v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
 EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k,v in MODEL_META["model_meta"].items() if v.get("size", False)}
 PROPRIETARY_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
 SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
 MODELS_TO_SKIP = MODEL_META["models_to_skip"]
 PROPRIETARY_MODELS = {
     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
     for model in PROPRIETARY_MODELS
 }
 SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
     for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
 }
 TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
 for board_config in BOARDS_CONFIG.values():
@@ -164,7 +181,13 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     # Initialize list to models that we cannot fetch metadata from
     df_list = []
     for model in EXTERNAL_MODEL_RESULTS:
-        results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
         if len(datasets) > 0:
             res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
         elif langs:
@@ -383,7 +406,10 @@ for task in TASKS:
     data[task] = {"metric": TASKS_CONFIG[task]["metric_description"], "data": []}
 for board, board_config in BOARDS_CONFIG.items():
-    board_pretty_name = f"{board_config['title']} leaderboard"
     acronym = board_config.get("acronym", None)
     board_icon = board_config.get("icon", None)
     if board_icon is None:
@@ -439,7 +465,7 @@ function(goalUrlObject) {
 def update_url_task(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
     current_task_language["task"] = event.target.id
     # Either use the cached language for this task or the 1st language
-    current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[0].children[0].id)
     return current_task_language, language_per_task
 def update_url_language(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
@@ -461,6 +487,8 @@ MODEL_TYPES = [
     "Open",
     "Proprietary",
     "Sentence Transformers",
 ]
 def filter_data(search_query, model_types, model_sizes, *full_dataframes):
@@ -484,6 +512,10 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes):
                     masks.append(df["Model"].isin(PROPRIETARY_MODELS))
                 elif model_type == "Sentence Transformers":
                     masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
             if masks:
                 df = df[reduce(lambda a, b: a | b, masks)]
             else:
@@ -535,16 +567,16 @@ with gr.Blocks(css=css) as block:
     with gr.Tabs() as outer_tabs:
         # Store the tabs for updating them on load based on URL parameters
         tabs.append(outer_tabs)
         for task, task_values in data.items():
             metric = task_values["metric"]
             task_tab_id = task.lower().replace(" ", "-")
             # Overall, Bitext Mining, Classification, etc.
-            with gr.Tab(task, id=task_tab_id) as task_tab:
                 # For updating the 'task' in the URL
                 task_tab.select(update_url_task, [current_task_language, language_per_task], [current_task_language, language_per_task]).then(None, [current_task_language], [], js=set_window_url_params)
                 with gr.Tabs() as task_tabs:
                     # Store the task tabs for updating them on load based on URL parameters
                     tabs.append(task_tabs)

 BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"]
 TASKS = list(TASKS_CONFIG.keys())
+# Display labels for identifiers that do not read well as-is (e.g. CamelCase task names).
+# Lookups fall back to the raw name when there is no entry here.
+PRETTY_NAMES = {
+    "InstructionRetrieval": "Retrieval w/Instructions",
+    "PairClassification": "Pair Classification",
+    "BitextMining": "Bitext Mining",
+}
 TASK_TO_METRIC = {k:v["metric"] for k,v in TASKS_CONFIG.items()}
 EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k,v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
 EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k,v in MODEL_META["model_meta"].items() if v.get("size", False)}
 PROPRIETARY_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
+# Task name -> one-sentence description, taken from each task's `task_description` in the config.
+TASK_DESCRIPTIONS = {k: v["task_description"] for k,v in TASKS_CONFIG.items()}
+# NOTE(review): "Overall" is set by hand — presumably it has no entry of its own in TASKS_CONFIG; confirm.
+TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
 SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
 MODELS_TO_SKIP = MODEL_META["models_to_skip"]
+CROSS_ENCODERS = MODEL_META["cross_encoders"]
+BI_ENCODERS = [k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]]
 PROPRIETARY_MODELS = {
     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
     for model in PROPRIETARY_MODELS
 }
 SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
     make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
     for model in SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS
 }
+# Re-key both groups by make_clickable_model(...) output, falling back to the leaderboard Space URL
+# when a model has no known link. filter_data matches these sets against df["Model"], so they
+# presumably need to hold the same rendered value that column stores — confirm.
+CROSS_ENCODERS = {
+    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+    for model in CROSS_ENCODERS
+}
+BI_ENCODERS = {
+    make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))
+    for model in BI_ENCODERS
+}
 TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
 for board_config in BOARDS_CONFIG.values():
     # Initialize list to models that we cannot fetch metadata from
     df_list = []
     for model in EXTERNAL_MODEL_RESULTS:
+        results_list = []
+        for task in tasks:
+            # Not all models have InstructionRetrieval, other new tasks
+            if task not in EXTERNAL_MODEL_RESULTS[model]:
+                continue
+            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]
         if len(datasets) > 0:
             res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
         elif langs:
     data[task] = {"metric": TASKS_CONFIG[task]["metric_description"], "data": []}
 for board, board_config in BOARDS_CONFIG.items():
+    init_name = board_config["title"]
+    if init_name in PRETTY_NAMES:
+        init_name = PRETTY_NAMES[init_name]
+    board_pretty_name = f"{init_name} leaderboard"
     acronym = board_config.get("acronym", None)
     board_icon = board_config.get("icon", None)
     if board_icon is None:
 def update_url_task(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
+    # Tab-select handler: records the selected tab's id as the current task, chooses the language
+    # to pair with it, and returns both state dicts (language_per_task is passed through as-is).
     current_task_language["task"] = event.target.id
     # Either use the cached language for this task or the 1st language
+    # NOTE(review): the fallback reads children[1], not children[0] — presumably because a
+    # gr.Markdown task description is now the tab's first child and the nested gr.Tabs the second;
+    # confirm, since this index must track the tab layout.
+    current_task_language["language"] = language_per_task.get(event.target.id, event.target.children[1].children[0].id)
     return current_task_language, language_per_task
 def update_url_language(event: gr.SelectData, current_task_language: dict, language_per_task: dict):
     "Open",
     "Proprietary",
     "Sentence Transformers",
+    "Cross-Encoders",
+    "Bi-Encoders"
 ]
 def filter_data(search_query, model_types, model_sizes, *full_dataframes):
                     masks.append(df["Model"].isin(PROPRIETARY_MODELS))
                 elif model_type == "Sentence Transformers":
                     masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS))
+                elif model_type == "Cross-Encoders":
+                    masks.append(df["Model"].isin(CROSS_ENCODERS))
+                elif model_type == "Bi-Encoders":
+                    masks.append(df["Model"].isin(BI_ENCODERS))
             if masks:
                 df = df[reduce(lambda a, b: a | b, masks)]
             else:
     with gr.Tabs() as outer_tabs:
         # Store the tabs for updating them on load based on URL parameters
         tabs.append(outer_tabs)
         for task, task_values in data.items():
             metric = task_values["metric"]
             task_tab_id = task.lower().replace(" ", "-")
             # Overall, Bitext Mining, Classification, etc.
+            pretty_task_name = task if task not in PRETTY_NAMES.keys() else PRETTY_NAMES[task]
+            with gr.Tab(pretty_task_name, id=task_tab_id) as task_tab:
                 # For updating the 'task' in the URL
                 task_tab.select(update_url_task, [current_task_language, language_per_task], [current_task_language, language_per_task]).then(None, [current_task_language], [], js=set_window_url_params)
+                gr.Markdown(TASK_DESCRIPTIONS[task])
                 with gr.Tabs() as task_tabs:
                     # Store the task tabs for updating them on load based on URL parameters
                     tabs.append(task_tabs)

config.yaml CHANGED Viewed

@@ -7,34 +7,47 @@ tasks:
     icon: "🎌"
     metric: f1
     metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
   Classification:
     icon: "❤️"
     metric: accuracy
     metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
   Clustering:
     icon: "✨"
     metric: v_measure
     metric_description: "Validity Measure (v_measure)"
   PairClassification:
     icon: "🎭"
     metric: cos_sim_ap
     metric_description: "Average Precision based on Cosine Similarities (cos_sim_ap)"
   Reranking:
     icon: "🥈"
     metric: map
     metric_description: "Mean Average Precision (MAP)"
   Retrieval:
     icon: "🔎"
     metric: ndcg_at_10
     metric_description: "Normalized Discounted Cumulative Gain @ k (ndcg_at_10)"
   STS:
     icon: "🤖"
     metric: cos_sim_spearman
     metric_description: "Spearman correlation based on cosine similarity"
   Summarization:
     icon: "📜"
     metric: cos_sim_spearman
     metric_description: "Spearman correlation based on cosine similarity"
 boards:
   en:
     title: English
@@ -250,6 +263,18 @@ boards:
         - MassiveIntentClassification (nb)
         - MassiveScenarioClassification (nb)
         - ScalaNbClassification
   law:
     title: Law
     language_long: "English, German, Chinese"

     icon: "🎌"
     metric: f1
     metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
+    task_description: "Bitext mining is the task of finding parallel sentences in two languages."
   Classification:
     icon: "❤️"
     metric: accuracy
     metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
+    task_description: "Classification is the task of assigning a label to a text."
   Clustering:
     icon: "✨"
     metric: v_measure
     metric_description: "Validity Measure (v_measure)"
+    task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "🎭"
     metric: cos_sim_ap
     metric_description: "Average Precision based on Cosine Similarities (cos_sim_ap)"
+    task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:
     icon: "🥈"
     metric: map
     metric_description: "Mean Average Precision (MAP)"
+    task_description: "Reranking is the task of reordering a list of documents to improve relevance."
   Retrieval:
     icon: "🔎"
     metric: ndcg_at_10
     metric_description: "Normalized Discounted Cumulative Gain @ k (ndcg_at_10)"
+    task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "🤖"
     metric: cos_sim_spearman
     metric_description: "Spearman correlation based on cosine similarity"
+    task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "📜"
     metric: cos_sim_spearman
     metric_description: "Spearman correlation based on cosine similarity"
+    task_description: "Summarization is the task of generating a summary of a text."
+  InstructionRetrieval:
+    icon: "🔎📋"
+    metric: "p-MRR"
+    metric_description: "paired mean reciprocal rank"
+    task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
 boards:
   en:
     title: English
         - MassiveIntentClassification (nb)
         - MassiveScenarioClassification (nb)
         - ScalaNbClassification
+  instructions:
+    title: English
+    language_long: "English"
+    has_overall: false
+    acronym: null
+    icon: null
+    credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)"
+    tasks:
+      InstructionRetrieval:
+      - Robust04InstructionRetrieval
+      - News21InstructionRetrieval
+      - Core17InstructionRetrieval
   law:
     title: Law
     language_long: "English, German, Chinese"

model_meta.yaml CHANGED Viewed

@@ -47,6 +47,20 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   LASER2:
     link: https://github.com/facebookresearch/LASER
     seq_len: N/A
@@ -263,6 +277,12 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   camembert-base:
     link: https://huggingface.co/almanach/camembert-base
     seq_len: 512
@@ -359,6 +379,14 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   e5-base:
     link: https://huggingface.co/intfloat/e5-base
     seq_len: 512
@@ -367,6 +395,14 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   e5-large:
     link: https://huggingface.co/intfloat/e5-large
     seq_len: 512
@@ -407,6 +443,22 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   flaubert_base_cased:
     link: https://huggingface.co/flaubert/flaubert_base_cased
     seq_len: 512
@@ -535,6 +587,22 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   komninos:
     link: https://huggingface.co/sentence-transformers/average_word_embeddings_komninos
     seq_len: N/A
@@ -543,6 +611,14 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   luotuo-bert-medium:
     link: https://huggingface.co/silk-road/luotuo-bert-medium
     seq_len: 512
@@ -567,6 +643,14 @@ model_meta:
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
   mistral-embed:
     link: https://docs.mistral.ai/guides/embeddings
     seq_len: null
@@ -575,6 +659,30 @@ model_meta:
     is_external: true
     is_proprietary: true
     is_sentence_transformers_compatible: false
   msmarco-bert-co-condensor:
     link: https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor
     seq_len: 512
@@ -903,6 +1011,22 @@ model_meta:
     is_external: true
     is_proprietary: true
     is_sentence_transformers_compatible: false
   text2vec-base-chinese:
     link: https://huggingface.co/shibing624/text2vec-base-chinese
     seq_len: 512
@@ -1184,3 +1308,13 @@ models_to_skip:
 - michaelfeil/ct2fast-gte-large
 - gizmo-ai/Cohere-embed-multilingual-v3.0
 - McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse

     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  FollowIR-7B:
+    link: https://huggingface.co/jhu-clsp/FollowIR-7B
+    seq_len: 4096
+    size: 7240
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
+  GritLM-7B:
+    link: https://huggingface.co/GritLM/GritLM-7B
+    seq_len: 4096
+    size: 7240
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   LASER2:
     link: https://github.com/facebookresearch/LASER
     seq_len: N/A
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  bm25:
+    link: https://en.wikipedia.org/wiki/Okapi_BM25
+    size: 0
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   camembert-base:
     link: https://huggingface.co/almanach/camembert-base
     seq_len: 512
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  e5-base-v2:
+    link: https://huggingface.co/intfloat/e5-base-v2
+    seq_len: 512
+    size: 110
+    dim: 768
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
   e5-base:
     link: https://huggingface.co/intfloat/e5-base
     seq_len: 512
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  e5-large-v2:
+    link: https://huggingface.co/intfloat/e5-large-v2
+    seq_len: 512
+    size: 335
+    dim: 1024
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
   e5-large:
     link: https://huggingface.co/intfloat/e5-large
     seq_len: 512
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  flan-t5-base:
+    link: https://huggingface.co/google/flan-t5-base
+    seq_len: 512
+    size: 220
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
+  flan-t5-large:
+    link: https://huggingface.co/google/flan-t5-large
+    seq_len: 512
+    size: 770
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
   flaubert_base_cased:
     link: https://huggingface.co/flaubert/flaubert_base_cased
     seq_len: 512
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  instructor-base:
+    link: https://huggingface.co/hkunlp/instructor-base
+    seq_len: N/A
+    size: 110
+    dim: 768
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
+  instructor-xl:
+    link: https://huggingface.co/hkunlp/instructor-xl
+    seq_len: N/A
+    size: 1241
+    dim: 768
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: true
   komninos:
     link: https://huggingface.co/sentence-transformers/average_word_embeddings_komninos
     seq_len: N/A
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  llama-2-7b-chat:
+    link: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+    seq_len: 4096
+    size: 7000
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   luotuo-bert-medium:
     link: https://huggingface.co/silk-road/luotuo-bert-medium
     seq_len: 512
     is_external: true
     is_proprietary: false
     is_sentence_transformers_compatible: true
+  mistral-7b-instruct-v0.2:
+    link: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+    seq_len: 4096
+    size: 7240
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   mistral-embed:
     link: https://docs.mistral.ai/guides/embeddings
     seq_len: null
     is_external: true
     is_proprietary: true
     is_sentence_transformers_compatible: false
+  monobert-large-msmarco:
+    link: https://huggingface.co/castorini/monobert-large-msmarco
+    seq_len: 512
+    size: 770
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
+  monot5-3b-msmarco-10k:
+    link: https://huggingface.co/castorini/monot5-3b-msmarco-10k
+    seq_len: 512
+    size: 2480
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
+  monot5-base-msmarco-10k:
+    link: https://huggingface.co/castorini/monot5-base-msmarco-10k
+    seq_len: 512
+    size: 220
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   msmarco-bert-co-condensor:
     link: https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor
     seq_len: 512
     is_external: true
     is_proprietary: true
     is_sentence_transformers_compatible: false
+  tart-dual-contriever-msmarco:
+    link: https://huggingface.co/orionweller/tart-dual-contriever-msmarco
+    seq_len: 512
+    size: 110
+    dim: 768
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
+  tart-full-flan-t5-xl:
+    link: https://huggingface.co/facebook/tart-full-flan-t5-xl
+    seq_len: 512
+    size: 2480
+    dim: -1
+    is_external: true
+    is_proprietary: false
+    is_sentence_transformers_compatible: false
   text2vec-base-chinese:
     link: https://huggingface.co/shibing624/text2vec-base-chinese
     seq_len: 512
 - michaelfeil/ct2fast-gte-large
 - gizmo-ai/Cohere-embed-multilingual-v3.0
 - McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse
+cross_encoders:
+- FollowIR-7B
+- flan-t5-base
+- flan-t5-large
+- monobert-large-msmarco
+- monot5-3b-msmarco-10k
+- monot5-base-msmarco-10k
+- llama-2-7b-chat
+- mistral-7b-instruct-v0.2
+- tart-full-flan-t5-xl