Santi Diana committed b8e8c93 (parent: 82d6c9a)

Feat: add new model implemented. Read README.md inside the add_new_model folder for more information.
Changed files:
- .DS_Store +0 -0
- add_new_model/MTEB_results_to_yaml.py +122 -0
- add_new_model/README.md +10 -0
- add_new_model/add_new_model.py +93 -0
- add_new_model/metadata_example/mteb_metadata.yaml +114 -0
- app.py +26 -25
- data/classification.csv +8 -0
- data/general.csv +8 -0
- data/sts.csv +8 -0
.DS_Store
ADDED
Binary file (6.15 kB)
add_new_model/MTEB_results_to_yaml.py
ADDED
@@ -0,0 +1,122 @@
"""
Usage: python MTEB_results_to_yaml.py path_to_results_folder

Creates evaluation results metadata for the model card.
E.g.
---
tags:
- mteb
model-index:
- name: SGPT-5.8B-weightedmean-msmarco-specb-bitfit
  results:
  - task:
      type: classification
    dataset:
      type: mteb/banking77
      name: MTEB Banking77
      config: default
      split: test
      revision: 44fa15921b4c889113cc5df03dd4901b49161ab7
    metrics:
    - type: accuracy
      value: 84.49350649350649
---
"""
import json
import logging
import os
import sys

from mteb import MTEB

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


results_folder = sys.argv[1].rstrip("/")
model_name = results_folder.split("/")[-1]

all_results = {}

for file_name in os.listdir(results_folder):
    if not file_name.endswith(".json"):
        logger.info(f"Skipping non-json {file_name}")
        continue
    with open(os.path.join(results_folder, file_name), "r", encoding="utf-8") as f:
        results = json.load(f)
        all_results = {**all_results, **{file_name.replace(".json", ""): results}}

# Use "train" split instead
TRAIN_SPLIT = ["DanishPoliticalCommentsClassification"]
# Use "validation" split instead
VALIDATION_SPLIT = ["AFQMC", "Cmnli", "IFlyTek", "TNews", "MSMARCO", "MultilingualSentiment", "Ocnli"]
# Use "dev" split instead
DEV_SPLIT = ["CmedqaRetrieval", "CovidRetrieval", "DuRetrieval", "EcomRetrieval", "MedicalRetrieval", "MMarcoReranking", "MMarcoRetrieval", "MSMARCO", "T2Reranking", "T2Retrieval", "VideoRetrieval"]

MARKER = "---"
TAGS = "tags:"
MTEB_TAG = "- mteb"
HEADER = "model-index:"
MODEL = f"- name: {model_name}"
RES = "  results:"

META_STRING = "\n".join([MARKER, TAGS, MTEB_TAG, HEADER, MODEL, RES])


ONE_TASK = "  - task:\n      type: {}\n    dataset:\n      type: {}\n      name: {}\n      config: {}\n      split: {}\n      revision: {}\n    metrics:"
ONE_METRIC = "    - type: {}\n      value: {}"
SKIP_KEYS = ["std", "evaluation_time", "main_score", "threshold"]

for ds_name, res_dict in sorted(all_results.items()):
    mteb_desc = (
        MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")]).tasks[0].description
    )
    hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name"))
    if "CQADupstack" in ds_name:
        hf_hub_name = "BeIR/cqadupstack"
    mteb_type = mteb_desc["type"]
    revision = res_dict.get("dataset_revision")  # Okay if it's None
    split = "test"
    if (ds_name in TRAIN_SPLIT) and ("train" in res_dict):
        split = "train"
    elif (ds_name in VALIDATION_SPLIT) and ("validation" in res_dict):
        split = "validation"
    elif (ds_name in DEV_SPLIT) and ("dev" in res_dict):
        split = "dev"
    elif "test" not in res_dict:
        logger.info(f"Skipping {ds_name} as split {split} not present.")
        continue
    res_dict = res_dict.get(split)
    for lang in mteb_desc["eval_langs"]:
        mteb_name = f"MTEB {ds_name}"
        mteb_name += f" ({lang})" if len(mteb_desc["eval_langs"]) > 1 else ""
        # For English there is no language key if it's the only language
        test_result_lang = res_dict.get(lang) if len(mteb_desc["eval_langs"]) > 1 else res_dict
        # Skip if the language was not found but it has other languages
        if test_result_lang is None:
            continue
        META_STRING += "\n" + ONE_TASK.format(
            mteb_type, hf_hub_name, mteb_name, lang if len(mteb_desc["eval_langs"]) > 1 else "default", split, revision
        )
        for metric, score in test_result_lang.items():
            if not isinstance(score, dict):
                score = {metric: score}
            for sub_metric, sub_score in score.items():
                if any([x in sub_metric for x in SKIP_KEYS]):
                    continue
                META_STRING += "\n" + ONE_METRIC.format(
                    f"{metric}_{sub_metric}" if metric != sub_metric else metric,
                    # All MTEB scores are 0-1; multiply them by 100 for 3 reasons:
                    # 1) It's easier to visually digest (you need two chars less: "0.1" -> "1")
                    # 2) Others may multiply them by 100 when building on MTEB, making it confusing what the range is.
                    #    This happened with the Text and Code Embeddings paper (OpenAI) vs. the original BEIR paper.
                    # 3) It's accepted practice (SuperGLUE, GLUE are 0-100)
                    sub_score * 100,
                )

META_STRING += "\n" + MARKER
if os.path.exists("./mteb_metadata.yaml"):
    logger.warning("Overwriting ./mteb_metadata.yaml")
with open("./mteb_metadata.yaml", "w") as f:
    f.write(META_STRING)
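
The script walks over every JSON file that MTEB wrote into the results folder and flattens its metrics into model-card YAML. For reference, a hypothetical results file it could consume might look like the sketch below; the structure is inferred from the parsing logic above, and the task, revision and score values are purely illustrative:

# Hypothetical contents of e.g. sentence-t5-xl/STS22.json, shown as a Python dict.
# STS22 reports several languages, so the scores sit under a per-language key ("es").
example_result = {
    "dataset_revision": "6d1ba47164174a496b7fa5d3569dae26a6813b80",
    "test": {                      # split picked by the script (default: "test")
        "evaluation_time": 12.3,   # ignored by the script
        "es": {
            # Nested metrics become cos_sim_pearson / cos_sim_spearman, multiplied by 100.
            "cos_sim": {"pearson": 0.494, "spearman": 0.582},
        },
    },
}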
add_new_model/README.md
ADDED
@@ -0,0 +1,10 @@
## How to add a new model to the Leaderboard

Here we explain how to add a new model to the Leaderboard. Follow these steps:

1. `git clone` this repository and `cd add_new_model`.
2. Evaluate the new model with the MTEB library. The evaluation produces a results folder: for example, evaluating `sentence-transformers/sentence-t5-large` produces a folder named `sentence-t5-large`. (A minimal evaluation sketch follows this list.)
3. Once evaluated, move that results folder into this `add_new_model` folder.
4. Run `python MTEB_results_to_yaml.py <results_folder>`. This creates a file named `mteb_metadata.yaml` containing the metadata for your evaluation.
5. Run `python add_new_model.py`. This appends your model's scores to the CSVs in `data/`, adding it to the Leaderboard.
6. Add, commit and `git push` the changes, without uploading the results folder or `mteb_metadata.yaml`.
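
A minimal sketch of step 2, assuming the model loads with sentence-transformers; the model name and the task list are illustrative and should be adapted to the model and the Spanish tasks you want on the leaderboard:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Illustrative model; replace it with the model you want to add.
model_id = "sentence-transformers/sentence-t5-large"
model = SentenceTransformer(model_id)

# Spanish classification and STS tasks currently shown on the leaderboard.
tasks = [
    "AmazonReviewsClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "STS17",
    "STS22",
]
evaluation = MTEB(tasks=tasks, task_langs=["es"])
# Writes one JSON results file per task into ./sentence-t5-large/
evaluation.run(model, output_folder=model_id.split("/")[-1])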
add_new_model/add_new_model.py
ADDED
@@ -0,0 +1,93 @@
import pandas as pd
import yaml
import numpy as np

def add_model(metadata_archive):
    """
    Updates the files that app.py reads to build the leaderboard. Whenever
    someone wants to add a new model, they have to run this script.

    1. Read the metadata, extract the scores and simply append a new row to each CSV.
    """
    # Initialize an empty DataFrame
    df = pd.DataFrame(columns=['dataset_name', 'Accuracy', 'Spearman', "Category"])

    with open(metadata_archive, 'r') as file:
        for index, data in enumerate(yaml.safe_load_all(file)):
            if index == 0:
                model_index_list = data.get('model-index', [])
                model_name = model_index_list[0].get('name')
                results_list = model_index_list[0].get('results', [])

                if results_list:
                    for i in range(len(results_list)):
                        task = results_list[i].get('task', {})
                        task_name = task.get("type")
                        dataset_name = results_list[i]['dataset']['name']

                        # Initialize the row with NaN values
                        row = {'dataset_name': dataset_name, 'Accuracy': None, 'Spearman': None}

                        if task_name == "Classification":
                            accuracy = next((metric.get('value') for metric in results_list[i].get('metrics', []) if metric.get('type') == 'accuracy'), None)
                            row['Accuracy'] = accuracy
                            row['Category'] = "Classification"
                        elif task_name == "STS":
                            spearman = next((metric.get('value') for metric in results_list[i].get('metrics', []) if metric.get('type') == 'cos_sim_spearman'), None)
                            row['Spearman'] = spearman
                            row["Category"] = "STS"

                        # Append the row to the DataFrame using pd.concat
                        new_df = pd.DataFrame([row])
                        df = pd.concat([df, new_df], ignore_index=True)

    df['Accuracy'] = pd.to_numeric(df['Accuracy'], errors='coerce')
    classification_average = round(df.loc[df['Category'] == 'Classification', 'Accuracy'].mean(), 2)

    df['Spearman'] = pd.to_numeric(df['Spearman'], errors='coerce')
    sts_spearman_average = round(df.loc[df['Category'] == 'STS', 'Spearman'].mean(), 2)

    ## CLASSIFICATION
    classification_dataframe = pd.read_csv('../data/classification.csv')
    classification_df = df[df['Category'] == 'Classification']
    new_row_data = {'Model name': model_name}

    for index, row in classification_df.iterrows():
        column_name = row['dataset_name']
        accuracy_value = row['Accuracy']
        new_row_data[column_name] = round(accuracy_value, 2)

    new_row_df = pd.DataFrame(new_row_data, index=[0])
    classification_dataframe = pd.concat([classification_dataframe, new_row_df], ignore_index=True)
    classification_dataframe.to_csv("../data/classification.csv", index=False)

    ## STS
    sts_dataframe = pd.read_csv('../data/sts.csv')
    sts_df = df[df['Category'] == 'STS']
    new_row_data = {'Model name': model_name}

    for index, row in sts_df.iterrows():
        column_name = row['dataset_name']
        spearman_value = row['Spearman']
        new_row_data[column_name] = round(spearman_value, 2)

    new_row_df = pd.DataFrame(new_row_data, index=[0])
    sts_dataframe = pd.concat([sts_dataframe, new_row_df], ignore_index=True)
    sts_dataframe.to_csv('../data/sts.csv', index=False)

    ## GENERAL
    general_dataframe = pd.read_csv("../data/general.csv")

    average = round(np.mean([classification_average, sts_spearman_average]), 2)
    ## TODO: fill in metadata such as Model Size or Embedding Dimensions.
    new_instance = {'Model name': model_name, 'Model Size (GB)': None, 'Embedding Dimensions': None, 'Average': average, 'Classification Average': classification_average, 'Clustering Average': None, 'STS Average': sts_spearman_average, 'Retrieval Average': None}
    new_row_df = pd.DataFrame(new_instance, index=[0])
    general_dataframe = pd.concat([general_dataframe, new_row_df], ignore_index=True)
    general_dataframe.to_csv("../data/general.csv", index=False)

add_model('mteb_metadata.yaml')
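
The script assumes it is run from inside add_new_model/, with mteb_metadata.yaml next to it and the leaderboard CSVs one directory up. A quick sanity check after running it could look like this sketch (paths as used above):

import pandas as pd

# Confirm that the new model's row was appended to each leaderboard table.
for csv_path in ("../data/general.csv", "../data/classification.csv", "../data/sts.csv"):
    table = pd.read_csv(csv_path)
    print(csv_path, "->", table["Model name"].iloc[-1])  # last row should be the new model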
add_new_model/metadata_example/mteb_metadata.yaml
ADDED
@@ -0,0 +1,114 @@
---
tags:
- mteb
model-index:
- name: sentence-t5-xl
  results:
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_reviews_multi
      name: MTEB AmazonReviewsClassification (es)
      config: es
      split: test
      revision: 1399c76144fd37290681b995c656ef9b2e06e26d
    metrics:
    - type: accuracy
      value: 45.007999999999996
    - type: f1
      value: 41.6679637623569
  - task:
      type: Classification
    dataset:
      type: mteb/mtop_domain
      name: MTEB MTOPDomainClassification (es)
      config: es
      split: test
      revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
    metrics:
    - type: accuracy
      value: 85.32354903268846
    - type: f1
      value: 85.23439986563692
  - task:
      type: Classification
    dataset:
      type: mteb/mtop_intent
      name: MTEB MTOPIntentClassification (es)
      config: es
      split: test
      revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
    metrics:
    - type: accuracy
      value: 57.384923282188126
    - type: f1
      value: 38.1008046822733
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_massive_intent
      name: MTEB MassiveIntentClassification (es)
      config: es
      split: test
      revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
    metrics:
    - type: accuracy
      value: 57.96906523201076
    - type: f1
      value: 57.053434089481605
  - task:
      type: Classification
    dataset:
      type: mteb/amazon_massive_scenario
      name: MTEB MassiveScenarioClassification (es)
      config: es
      split: test
      revision: 7d571f92784cd94a019292a1f45445077d0ef634
    metrics:
    - type: accuracy
      value: 62.51513113651648
    - type: f1
      value: 61.428522227301464
  - task:
      type: STS
    dataset:
      type: mteb/sts17-crosslingual-sts
      name: MTEB STS17 (es-es)
      config: es-es
      split: test
      revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
    metrics:
    - type: cos_sim_pearson
      value: 83.7632102444147
    - type: cos_sim_spearman
      value: 83.41808607885294
    - type: euclidean_pearson
      value: 84.2318059368248
    - type: euclidean_spearman
      value: 83.41874306738518
    - type: manhattan_pearson
      value: 84.31088958713279
    - type: manhattan_spearman
      value: 83.41585915763147
  - task:
      type: STS
    dataset:
      type: mteb/sts22-crosslingual-sts
      name: MTEB STS22 (es)
      config: es
      split: test
      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
    metrics:
    - type: cos_sim_pearson
      value: 49.40174203847368
    - type: cos_sim_spearman
      value: 58.15600173312334
    - type: euclidean_pearson
      value: 53.967323698454365
    - type: euclidean_spearman
      value: 58.15600173312334
    - type: manhattan_pearson
      value: 53.89976435331337
    - type: manhattan_spearman
      value: 58.187134535671284
---
app.py
CHANGED
@@ -2,9 +2,29 @@ import gradio as gr
 import pandas as pd
 
 block = gr.Blocks()
-NUM_DATASETS =
+NUM_DATASETS = 7
 NUM_SCORES = 0
-NUM_MODELS =
+NUM_MODELS = 5
+
+def general_dataframe_update():
+    """
+    Returns the general dataframe for the general table.
+    """
+    dataframe = pd.read_csv('data/general.csv')
+    return dataframe
+
+def classification_dataframe_update():
+    """
+    Returns the classification dataframe for the classification table.
+    """
+    dataframe = pd.read_csv('data/classification.csv')
+    return dataframe
+def sts_dataframe_update():
+    """
+    Returns the STS dataframe for the STS table.
+    """
+    dataframe = pd.read_csv('data/sts.csv')
+    return dataframe
 
 with block:
     gr.Markdown(f"""**Leaderboard de modelos de Embeddings en español
@@ -24,16 +44,7 @@ with block:
     - **Idioma:** Español
     """)
     with gr.Row():
-        overall =
-            'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large','hackathon-pln-es/paraphrase-spanish-distilroberta'],
-            'Model Size (GB)': 100,
-            'Embedding Dimensions': 100,
-            'Average': 56,
-            'Classification Average': 55,
-            "Clustering Average": 50,
-            'STS Average': 40,
-            "Retrieval Average": 30
-        })
+        overall = general_dataframe_update()
         data_overall = gr.components.Dataframe(
             overall,
             type="pandas",
@@ -42,14 +53,7 @@ with block:
         with gr.TabItem("Classification"):
             with gr.Row():
                 # Create and display a sample DataFrame
-                classification =
-                    'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large','hackathon-pln-es/paraphrase-spanish-distilroberta'],
-                    'AmazonReviewsClassification': 100,
-                    'MTOPDomainClassification': 100,
-                    'MassiveIntentClassification': 56,
-                    'MassiveScenarioClassification': 55,
-                    "MTOPIntentClassification": 50,
-                })
+                classification = classification_dataframe_update()
                 data_overall = gr.components.Dataframe(
                     classification,
                     type="pandas",
@@ -58,11 +62,7 @@ with block:
         with gr.TabItem("STS"):
            with gr.Row():
                 # Create and display a sample DataFrame
-                sts =
-                    'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large','hackathon-pln-es/paraphrase-spanish-distilroberta'],
-                    'STS22': 100,
-                    'STS17': 100,
-                })
+                sts = sts_dataframe_update()
                 data_overall = gr.components.Dataframe(
                     sts,
                     type="pandas",
@@ -70,3 +70,4 @@ with block:
     )
 
 block.launch()
+
data/classification.csv
ADDED
@@ -0,0 +1,8 @@
Model name,MTEB AmazonReviewsClassification (es),MTEB MTOPDomainClassification (es),MTEB MTOPIntentClassification (es),MTEB MassiveIntentClassification (es),MTEB MassiveScenarioClassification (es)
multilingual-e5-large,42.66,89.95,66.84,64.68,68.85
bge-small-en-v1.5,32.03,76.93,52.15,48.77,54.42
multilingual-e5-base,42.47,89.62,60.27,60.51,66.52
multilingual-e5-small,41.3,87.33,55.87,58.06,63.1
paraphrase-multilingual-mpnet-base-v2,39.99,86.96,66.59,64.43,70.42
sentence-t5-large,42.89,80.78,52.07,54.1,59.56
sentence-t5-xl,45.01,85.32,57.38,57.97,62.52
data/general.csv
ADDED
@@ -0,0 +1,8 @@
Model name,Model Size (GB),Embedding Dimensions,Average,Classification Average,Clustering Average,STS Average,Retrieval Average
multilingual-e5-large,,,72.22,66.6,,77.83,
bge-small-en-v1.5,,,59.73,52.86,,66.6,
multilingual-e5-base,,,70.7,63.88,,77.53,
multilingual-e5-small,,,68.64,61.13,,76.15,
paraphrase-multilingual-mpnet-base-v2,,,69.1,65.68,,72.53,
sentence-t5-large,,,64.04,57.88,,70.21,
sentence-t5-xl,,,66.22,61.64,,70.79,
data/sts.csv
ADDED
@@ -0,0 +1,8 @@
Model name,MTEB STS17 (es-es),MTEB STS22 (es)
multilingual-e5-large,87.42,68.23
bge-small-en-v1.5,77.73,55.47
multilingual-e5-base,87.26,67.79
multilingual-e5-small,85.27,67.04
paraphrase-multilingual-mpnet-base-v2,85.14,59.91
sentence-t5-large,82.74,57.68
sentence-t5-xl,83.42,58.16
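
The Average, Classification Average and STS Average columns in data/general.csv follow from the per-dataset scores, mirroring the computation in add_new_model.py. As a worked example, the sentence-t5-xl row derives from the raw metric values in the mteb_metadata.yaml example above:

import numpy as np

# Classification Average: mean accuracy over the five classification tasks.
classification_average = round(np.mean([45.007999999999996, 85.32354903268846,
                                        57.384923282188126, 57.96906523201076,
                                        62.51513113651648]), 2)              # 61.64
# STS Average: mean cos_sim_spearman over the two STS tasks.
sts_spearman_average = round(np.mean([83.41808607885294,
                                      58.15600173312334]), 2)                # 70.79
# Overall Average: mean of the two per-category averages, as in data/general.csv.
average = round(np.mean([classification_average, sts_spearman_average]), 2)  # 66.22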