Santi Diana committed on
Commit
b8e8c93
1 Parent(s): 82d6c9a

Feat: add workflow for adding a new model. Read README.md inside the add_new_model folder for more information

.DS_Store ADDED
Binary file (6.15 kB).
 
add_new_model/MTEB_results_to_yaml.py ADDED
@@ -0,0 +1,122 @@
+"""
+Usage: python MTEB_results_to_yaml.py path_to_results_folder
+
+Creates evaluation results metadata for the model card.
+E.g.
+---
+tags:
+- mteb
+model-index:
+- name: SGPT-5.8B-weightedmean-msmarco-specb-bitfit
+  results:
+  - task:
+      type: classification
+    dataset:
+      type: mteb/banking77
+      name: MTEB Banking77
+      config: default
+      split: test
+      revision: 44fa15921b4c889113cc5df03dd4901b49161ab7
+    metrics:
+    - type: accuracy
+      value: 84.49350649350649
+---
+"""
+
+import json
+import logging
+import os
+import sys
+
+from mteb import MTEB
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+results_folder = sys.argv[1].rstrip("/")
+model_name = results_folder.split("/")[-1]
+
+all_results = {}
+
+for file_name in os.listdir(results_folder):
+    if not file_name.endswith(".json"):
+        logger.info(f"Skipping non-json {file_name}")
+        continue
+    with open(os.path.join(results_folder, file_name), "r", encoding="utf-8") as f:
+        results = json.load(f)
+        all_results = {**all_results, **{file_name.replace(".json", ""): results}}
+
+# Use "train" split instead
+TRAIN_SPLIT = ["DanishPoliticalCommentsClassification"]
+# Use "validation" split instead
+VALIDATION_SPLIT = ["AFQMC", "Cmnli", "IFlyTek", "TNews", "MSMARCO", "MultilingualSentiment", "Ocnli"]
+# Use "dev" split instead
+DEV_SPLIT = ["CmedqaRetrieval", "CovidRetrieval", "DuRetrieval", "EcomRetrieval", "MedicalRetrieval", "MMarcoReranking", "MMarcoRetrieval", "MSMARCO", "T2Reranking", "T2Retrieval", "VideoRetrieval"]
+
+MARKER = "---"
+TAGS = "tags:"
+MTEB_TAG = "- mteb"
+HEADER = "model-index:"
+MODEL = f"- name: {model_name}"
+RES = "  results:"
+
+META_STRING = "\n".join([MARKER, TAGS, MTEB_TAG, HEADER, MODEL, RES])
+
+
+ONE_TASK = "  - task:\n      type: {}\n    dataset:\n      type: {}\n      name: {}\n      config: {}\n      split: {}\n      revision: {}\n    metrics:"
+ONE_METRIC = "    - type: {}\n      value: {}"
+SKIP_KEYS = ["std", "evaluation_time", "main_score", "threshold"]
+
+for ds_name, res_dict in sorted(all_results.items()):
+    mteb_desc = (
+        MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")]).tasks[0].description
+    )
+    hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name"))
+    if "CQADupstack" in ds_name:
+        hf_hub_name = "BeIR/cqadupstack"
+    mteb_type = mteb_desc["type"]
+    revision = res_dict.get("dataset_revision")  # Okay if it's None
+    split = "test"
+    if (ds_name in TRAIN_SPLIT) and ("train" in res_dict):
+        split = "train"
+    elif (ds_name in VALIDATION_SPLIT) and ("validation" in res_dict):
+        split = "validation"
+    elif (ds_name in DEV_SPLIT) and ("dev" in res_dict):
+        split = "dev"
+    elif "test" not in res_dict:
+        logger.info(f"Skipping {ds_name} as split {split} not present.")
+        continue
+    res_dict = res_dict.get(split)
+    for lang in mteb_desc["eval_langs"]:
+        mteb_name = f"MTEB {ds_name}"
+        mteb_name += f" ({lang})" if len(mteb_desc["eval_langs"]) > 1 else ""
+        # For English there is no language key if it's the only language
+        test_result_lang = res_dict.get(lang) if len(mteb_desc["eval_langs"]) > 1 else res_dict
+        # Skip if the language was not found but it has other languages
+        if test_result_lang is None:
+            continue
+        META_STRING += "\n" + ONE_TASK.format(
+            mteb_type, hf_hub_name, mteb_name, lang if len(mteb_desc["eval_langs"]) > 1 else "default", split, revision
+        )
+        for metric, score in test_result_lang.items():
+            if not isinstance(score, dict):
+                score = {metric: score}
+            for sub_metric, sub_score in score.items():
+                if any([x in sub_metric for x in SKIP_KEYS]):
+                    continue
+                META_STRING += "\n" + ONE_METRIC.format(
+                    f"{metric}_{sub_metric}" if metric != sub_metric else metric,
+                    # All MTEB scores are 0-1, multiply them by 100 for 3 reasons:
+                    # 1) It's easier to visually digest (You need two chars less: "0.1" -> "1")
+                    # 2) Others may multiply them by 100 when building on MTEB, making it confusing what the range is
+                    #    This happened with the Text and Code Embeddings paper (OpenAI) vs the original BEIR paper
+                    # 3) It's accepted practice (SuperGLUE, GLUE are 0-100)
+                    sub_score * 100,
+                )
+
+META_STRING += "\n" + MARKER
+if os.path.exists("./mteb_metadata.yaml"):
+    logger.warning("Overwriting ./mteb_metadata.yaml")
+with open("./mteb_metadata.yaml", "w") as f:
+    f.write(META_STRING)
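
For reference, the loop above reads one `<TaskName>.json` per task from the results folder, keyed by split (and by language for multilingual tasks), and flattens nested metrics into `metric_submetric` names. A minimal, hypothetical sketch of that shape (the file name, folder name and numbers are placeholders; the exact keys MTEB writes can vary by task and version):

```python
# Hypothetical illustration of the per-task JSON shape consumed by MTEB_results_to_yaml.py.
import json
import os

os.makedirs("sentence-t5-xl", exist_ok=True)  # placeholder results folder
sts17 = {
    "dataset_revision": "af5e6fb845001ecf41f4c1e033ce921939a2a68d",
    "test": {
        # nested metric dicts become cos_sim_pearson / cos_sim_spearman in the YAML
        "es-es": {"cos_sim": {"pearson": 0.8376, "spearman": 0.8342}},
    },
}
with open("sentence-t5-xl/STS17.json", "w") as f:
    json.dump(sts17, f)
# python MTEB_results_to_yaml.py sentence-t5-xl would then emit entries like those
# in metadata_example/mteb_metadata.yaml below, with scores multiplied by 100.
```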
add_new_model/README.md ADDED
@@ -0,0 +1,10 @@
+## How to add a new model to the Leaderboard
+
+This explains how to add a new model to the Leaderboard. Follow these steps:
+1. `git clone` this repository and `cd add_new_model`.
+2. Evaluate the new model with the MTEB library (see the sketch after this file). The evaluation produces a results folder, e.g. `sentence-t5-large`
+   when evaluating `sentence-transformers/sentence-t5-large`.
+3. Once evaluated, move that results folder into the `add_new_model` folder.
+4. Run `MTEB_results_to_yaml.py`. It creates a file named `mteb_metadata.yaml` containing the metadata of your evaluation.
+5. Run `add_new_model.py`. It adds your model to the Leaderboard.
+6. Add, commit and `git push` the changes, without uploading the results folder or `mteb_metadata.yaml`.
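
For step 2, a minimal evaluation sketch, assuming the `mteb` and `sentence-transformers` packages are installed; the model name and the Spanish task selection below are illustrative placeholders, not a prescribed configuration:

```python
# Hypothetical example: evaluate a SentenceTransformer on the Spanish tasks used by
# this leaderboard and write results to a folder named after the model.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/sentence-t5-large"  # placeholder model
model = SentenceTransformer(model_name)

tasks = [
    "AmazonReviewsClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "STS17",
    "STS22",
]

evaluation = MTEB(tasks=tasks, task_langs=["es"])
evaluation.run(model, output_folder=model_name.split("/")[-1])  # e.g. ./sentence-t5-large
```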
add_new_model/add_new_model.py ADDED
@@ -0,0 +1,93 @@
+
+import pandas as pd
+import yaml
+import numpy as np
+
+def add_model(metadata_archive):
+    """
+    Updates the files from which app.py takes the information to build the leaderboard.
+    So, whenever someone wants to add a new model, they have to run this script.
+
+    1. Read the metadata and the CSVs, extract the scores and simply append a new row to each table.
+
+    """
+    # Initialize an empty DataFrame
+    df = pd.DataFrame(columns=['dataset_name', 'Accuracy', 'Spearman', "Category"])
+
+    with open(metadata_archive, 'r') as file:
+        for index, data in enumerate(yaml.safe_load_all(file)):
+            if index == 0:
+                model_index_list = data.get('model-index', [])
+                model_name = model_index_list[0].get('name')
+                results_list = model_index_list[0].get('results', [])
+
+                if results_list:
+                    for i in range(len(results_list)):
+                        task = results_list[i].get('task', {})
+                        task_name = task.get("type")
+                        dataset_name = results_list[i]['dataset']['name']
+
+                        # Initialize the row with NaN values
+                        row = {'dataset_name': dataset_name, 'Accuracy': None, 'Spearman': None}
+
+                        if task_name == "Classification":
+                            accuracy = next((metric.get('value') for metric in results_list[i].get('metrics', []) if metric.get('type') == 'accuracy'), None)
+                            row['Accuracy'] = accuracy
+                            row['Category'] = "Classification"
+                        elif task_name == "STS":
+                            spearman = next((metric.get('value') for metric in results_list[i].get('metrics', []) if metric.get('type') == 'cos_sim_spearman'), None)
+                            row['Spearman'] = spearman
+                            row["Category"] = "STS"
+
+                        # Append the row to the DataFrame using pd.concat
+                        new_df = pd.DataFrame([row])
+                        df = pd.concat([df, new_df], ignore_index=True)
+
+    df['Accuracy'] = pd.to_numeric(df['Accuracy'], errors='coerce')
+    classification_average = round(df.loc[df['Category'] == 'Classification', 'Accuracy'].mean(), 2)
+
+    df['Spearman'] = pd.to_numeric(df['Spearman'], errors='coerce')
+    sts_spearman_average = round(df.loc[df['Category'] == 'STS', 'Spearman'].mean(), 2)
+
+    ## CLASSIFICATION
+    classification_dataframe = pd.read_csv('../data/classification.csv')
+    classification_df = df[df['Category'] == 'Classification']
+    new_row_data = {'Model name': model_name}
+
+    for index, row in classification_df.iterrows():
+        column_name = row['dataset_name']
+        accuracy_value = row['Accuracy']
+        new_row_data[column_name] = round(accuracy_value, 2)
+
+    new_row_df = pd.DataFrame(new_row_data, index=[0])
+    classification_dataframe = pd.concat([classification_dataframe, new_row_df], ignore_index=True)
+    classification_dataframe.to_csv("../data/classification.csv", index=False)
+
+    ## STS
+    sts_dataframe = pd.read_csv('../data/sts.csv')
+    sts_df = df[df['Category'] == 'STS']
+    new_row_data = {'Model name': model_name}
+
+    for index, row in sts_df.iterrows():
+        column_name = row['dataset_name']
+        spearman_value = row['Spearman']
+        new_row_data[column_name] = round(spearman_value, 2)
+
+    new_row_df = pd.DataFrame(new_row_data, index=[0])
+    sts_dataframe = pd.concat([sts_dataframe, new_row_df], ignore_index=True)
+    sts_dataframe.to_csv('../data/sts.csv', index=False)
+
+    ## GENERAL
+    general_dataframe = pd.read_csv("../data/general.csv")
+
+    average = round(np.mean([classification_average, sts_spearman_average]), 2)
+    ## TODO: fill in metadata such as Model Size or Embedding Dimensions.
+    new_instance = {'Model name': model_name, 'Model Size (GB)': None, 'Embedding Dimensions': None, 'Average': average, 'Classification Average': classification_average, 'Clustering Average': None, 'STS Average': sts_spearman_average, 'Retrieval Average': None}
+    new_row_df = pd.DataFrame(new_instance, index=[0])
+    general_dataframe = pd.concat([general_dataframe, new_row_df], ignore_index=True)
+    general_dataframe.to_csv("../data/general.csv", index=False)
+
+add_model('mteb_metadata.yaml')
+
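
A minimal sketch of the front-matter parsing that `add_model` relies on: `mteb_metadata.yaml` starts and ends with `---`, so `yaml.safe_load_all` yields the metadata mapping as its first document (the trailing `---` opens a second, empty document, which is why only `index == 0` is used). The print statements are illustrative only:

```python
import yaml

# Hypothetical check: inspect the model-index block of a generated metadata file.
with open("mteb_metadata.yaml", "r") as f:
    first_doc = next(yaml.safe_load_all(f))  # the mapping between the two '---' markers

model_entry = first_doc["model-index"][0]
print(model_entry["name"])          # e.g. "sentence-t5-xl"
print(len(model_entry["results"]))  # number of (task, dataset) result blocks
```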
add_new_model/metadata_example/mteb_metadata.yaml ADDED
@@ -0,0 +1,114 @@
+---
+tags:
+- mteb
+model-index:
+- name: sentence-t5-xl
+  results:
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_reviews_multi
+      name: MTEB AmazonReviewsClassification (es)
+      config: es
+      split: test
+      revision: 1399c76144fd37290681b995c656ef9b2e06e26d
+    metrics:
+    - type: accuracy
+      value: 45.007999999999996
+    - type: f1
+      value: 41.6679637623569
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/mtop_domain
+      name: MTEB MTOPDomainClassification (es)
+      config: es
+      split: test
+      revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf
+    metrics:
+    - type: accuracy
+      value: 85.32354903268846
+    - type: f1
+      value: 85.23439986563692
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/mtop_intent
+      name: MTEB MTOPIntentClassification (es)
+      config: es
+      split: test
+      revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba
+    metrics:
+    - type: accuracy
+      value: 57.384923282188126
+    - type: f1
+      value: 38.1008046822733
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_intent
+      name: MTEB MassiveIntentClassification (es)
+      config: es
+      split: test
+      revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
+    metrics:
+    - type: accuracy
+      value: 57.96906523201076
+    - type: f1
+      value: 57.053434089481605
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_scenario
+      name: MTEB MassiveScenarioClassification (es)
+      config: es
+      split: test
+      revision: 7d571f92784cd94a019292a1f45445077d0ef634
+    metrics:
+    - type: accuracy
+      value: 62.51513113651648
+    - type: f1
+      value: 61.428522227301464
+  - task:
+      type: STS
+    dataset:
+      type: mteb/sts17-crosslingual-sts
+      name: MTEB STS17 (es-es)
+      config: es-es
+      split: test
+      revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d
+    metrics:
+    - type: cos_sim_pearson
+      value: 83.7632102444147
+    - type: cos_sim_spearman
+      value: 83.41808607885294
+    - type: euclidean_pearson
+      value: 84.2318059368248
+    - type: euclidean_spearman
+      value: 83.41874306738518
+    - type: manhattan_pearson
+      value: 84.31088958713279
+    - type: manhattan_spearman
+      value: 83.41585915763147
+  - task:
+      type: STS
+    dataset:
+      type: mteb/sts22-crosslingual-sts
+      name: MTEB STS22 (es)
+      config: es
+      split: test
+      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
+    metrics:
+    - type: cos_sim_pearson
+      value: 49.40174203847368
+    - type: cos_sim_spearman
+      value: 58.15600173312334
+    - type: euclidean_pearson
+      value: 53.967323698454365
+    - type: euclidean_spearman
+      value: 58.15600173312334
+    - type: manhattan_pearson
+      value: 53.89976435331337
+    - type: manhattan_spearman
+      value: 58.187134535671284
+---
app.py CHANGED
@@ -2,9 +2,29 @@ import gradio as gr
 import pandas as pd
 
 block = gr.Blocks()
-NUM_DATASETS = 0
+NUM_DATASETS = 7
 NUM_SCORES = 0
-NUM_MODELS = 0
+NUM_MODELS = 5
+
+def general_dataframe_update():
+    """
+    Returns the general dataframe for the general table.
+    """
+    dataframe = pd.read_csv('data/general.csv')
+    return dataframe
+
+def classification_dataframe_update():
+    """
+    Returns the classification dataframe for the classification table.
+    """
+    dataframe = pd.read_csv('data/classification.csv')
+    return dataframe
+def sts_dataframe_update():
+    """
+    Returns the STS dataframe for the STS table.
+    """
+    dataframe = pd.read_csv('data/sts.csv')
+    return dataframe
 
 with block:
     gr.Markdown(f"""**Leaderboard de modelos de Embeddings en español
@@ -24,16 +44,7 @@ with block:
     - **Idioma:** Español
     """)
     with gr.Row():
-        overall = pd.DataFrame({
-            'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large', 'hackathon-pln-es/paraphrase-spanish-distilroberta'],
-            'Model Size (GB)': 100,
-            'Embedding Dimensions': 100,
-            'Average': 56,
-            'Classification Average': 55,
-            "Clustering Average": 50,
-            'STS Average': 40,
-            "Retrieval Average": 30
-        })
+        overall = general_dataframe_update()
         data_overall = gr.components.Dataframe(
             overall,
             type="pandas",
@@ -42,14 +53,7 @@ with block:
         with gr.TabItem("Classification"):
             with gr.Row():
                 # Create and display a sample DataFrame
-                classification = pd.DataFrame({
-                    'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large', 'hackathon-pln-es/paraphrase-spanish-distilroberta'],
-                    'AmazonReviewsClassification': 100,
-                    'MTOPDomainClassification': 100,
-                    'MassiveIntentClassification': 56,
-                    'MassiveScenarioClassification': 55,
-                    "MTOPIntentClassification": 50,
-                })
+                classification = classification_dataframe_update()
                 data_overall = gr.components.Dataframe(
                     classification,
                     type="pandas",
@@ -58,11 +62,7 @@ with block:
         with gr.TabItem("STS"):
            with gr.Row():
                 # Create and display a sample DataFrame
-                sts = pd.DataFrame({
-                    'Model name': ['hiiamsid/sentence_similarity_spanish_es', 'clibrain/paraphrase-multilingual-mpnet-base-v2-ft-stsb_multi_mt-embeddings', 'intfloat/mulilingua-e5-large', 'hackathon-pln-es/paraphrase-spanish-distilroberta'],
-                    'STS22': 100,
-                    'STS17': 100,
-                })
+                sts = sts_dataframe_update()
                 data_overall = gr.components.Dataframe(
                     sts,
                     type="pandas",
@@ -70,3 +70,4 @@ with block:
             )
 
 block.launch()
+
data/classification.csv ADDED
@@ -0,0 +1,8 @@
+Model name,MTEB AmazonReviewsClassification (es),MTEB MTOPDomainClassification (es),MTEB MTOPIntentClassification (es),MTEB MassiveIntentClassification (es),MTEB MassiveScenarioClassification (es)
+multilingual-e5-large,42.66,89.95,66.84,64.68,68.85
+bge-small-en-v1.5,32.03,76.93,52.15,48.77,54.42
+multilingual-e5-base,42.47,89.62,60.27,60.51,66.52
+multilingual-e5-small,41.3,87.33,55.87,58.06,63.1
+paraphrase-multilingual-mpnet-base-v2,39.99,86.96,66.59,64.43,70.42
+sentence-t5-large,42.89,80.78,52.07,54.1,59.56
+sentence-t5-xl,45.01,85.32,57.38,57.97,62.52
data/general.csv ADDED
@@ -0,0 +1,8 @@
+Model name,Model Size (GB),Embedding Dimensions,Average,Classification Average,Clustering Average,STS Average,Retrieval Average
+multilingual-e5-large,,,72.22,66.6,,77.83,
+bge-small-en-v1.5,,,59.73,52.86,,66.6,
+multilingual-e5-base,,,70.7,63.88,,77.53,
+multilingual-e5-small,,,68.64,61.13,,76.15,
+paraphrase-multilingual-mpnet-base-v2,,,69.1,65.68,,72.53,
+sentence-t5-large,,,64.04,57.88,,70.21,
+sentence-t5-xl,,,66.22,61.64,,70.79,
data/sts.csv ADDED
@@ -0,0 +1,8 @@
+Model name,MTEB STS17 (es-es),MTEB STS22 (es)
+multilingual-e5-large,87.42,68.23
+bge-small-en-v1.5,77.73,55.47
+multilingual-e5-base,87.26,67.79
+multilingual-e5-small,85.27,67.04
+paraphrase-multilingual-mpnet-base-v2,85.14,59.91
+sentence-t5-large,82.74,57.68
+sentence-t5-xl,83.42,58.16