Commit d38c074 — committed by Santi Diana — Parent(s): c4a71e3

Automatic Evaluation Available. Read the README file
git add add_new_model/README.md add_new_model/add_new_model.py app.py add_new_model/execute_evaluation.py

Changed files:
- add_new_model/README.md             +12 -8
- add_new_model/add_new_model.py      +130 -3
- add_new_model/execute_evaluation.py +39 -0
- app.py                              +11 -0
add_new_model/README.md
CHANGED
@@ -2,11 +2,15 @@
 
 Here we are going to explain how to add a new model to the Leaderboard. The next steps must be followed:
 
 1. `Git clone` this repository and `cd add_new_model`.
-2.
-
-
-
-
-
-
-
+2. If you want to evaluate the model and add it to the Leaderboard, you just need to execute the following command (a sketch of the flow behind it follows this diff):
+   `python3 add_new_model.py --model_id <HF_model> --execute_eval True`
+   This command runs the evaluation in your runtime. It creates two elements: a results folder that stores all the information about the evaluation process, and a file called `mteb_metadata.yaml` that contains the processed metadata of that evaluation.
+   That metadata is read, re-ordered and appended to the three CSV files in the `data` folder of this repository. The `app.py` file reads those CSV files, so be very careful with them.
+3. If you do not want to evaluate the model because you have already done so, you just need to execute the following command:
+   `python3 add_new_model.py --output_folder <results_folder>`. That folder is the one holding the JSON files from the previous evaluation you ran with the MTEB library. If you select a different folder (e.g. a parent folder of the actual results folder), the code will raise an error. This path runs very fast.
+4. After you have added your model to the Leaderboard, remember to execute two more commands:
+   `rm -r <results_folder>`
+   `rm mteb_metadata.yaml`
+   We do not need those files, so we will not accept any PR that contains them.
+5. Once the previous steps are done, you can execute `python3 app.py` from the parent folder of this repository to preview the Leaderboard before pushing it to the hub.
+6. Add, commit and `git push` the changes. Remember not to push your results.
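For orientation, the automatic path in step 2 boils down to the three functions introduced in this commit. The following is a minimal sketch (not part of the repository) of that flow, run from inside `add_new_model/`; the model id is only a placeholder:

```python
# Sketch of what `add_new_model.py --model_id <HF_model> --execute_eval True` does,
# using the functions added in this commit. The model id below is a placeholder.
from execute_evaluation import evaluate            # runs MTEB on the Spanish tasks
from add_new_model import results_to_yaml, add_model

output_folder = evaluate("your-user/your-model")   # writes JSON results under results/<model_id>
results_to_yaml(output_folder)                     # converts them into mteb_metadata.yaml
add_model()                                        # merges the metadata into the CSVs in data/
```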
add_new_model/add_new_model.py
CHANGED
@@ -1,9 +1,15 @@
-
 import pandas as pd
 import yaml
 import numpy as np
+import argparse
+from execute_evaluation import evaluate
+import logging
+import os
+import json
+import sys
+from mteb import MTEB
 
-def add_model(metadata_archive):
+def add_model():
     """
     This updates the file from which app.py takes the information to build the leaderboard. So, whenever
     someone wants to add a new model, they have to run this file.
@@ -88,5 +94,126 @@ def add_model(metadata_archive):
     general_dataframe = pd.concat([general_dataframe, new_row_df], ignore_index=True)
     general_dataframe.to_csv("../data/general.csv", index=False)
 
-add_model('mteb_metadata.yaml')
 
+def results_to_yaml(results_folder):
+
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
+
+    model_name = results_folder.split("/")[-1]
+
+    all_results = {}
+
+    for file_name in os.listdir(results_folder):
+        if not file_name.endswith(".json"):
+            raise ValueError("This is not the proper folder. It does not contain the corresponding JSON files.")
+        with open(os.path.join(results_folder, file_name), "r", encoding="utf-8") as f:
+            results = json.load(f)
+        all_results = {**all_results, **{file_name.replace(".json", ""): results}}
+
+    # Use "train" split instead
+    TRAIN_SPLIT = ["DanishPoliticalCommentsClassification"]
+    # Use "validation" split instead
+    VALIDATION_SPLIT = ["AFQMC", "Cmnli", "IFlyTek", "TNews", "MSMARCO", "MultilingualSentiment", "Ocnli"]
+    # Use "dev" split instead
+    DEV_SPLIT = ["CmedqaRetrieval", "CovidRetrieval", "DuRetrieval", "EcomRetrieval", "MedicalRetrieval", "MMarcoReranking", "MMarcoRetrieval", "MSMARCO", "T2Reranking", "T2Retrieval", "VideoRetrieval"]
+
+    MARKER = "---"
+    TAGS = "tags:"
+    MTEB_TAG = "- mteb"
+    HEADER = "model-index:"
+    MODEL = f"- name: {model_name}"
+    RES = "  results:"
+
+    META_STRING = "\n".join([MARKER, TAGS, MTEB_TAG, HEADER, MODEL, RES])
+
+    ONE_TASK = "  - task:\n      type: {}\n    dataset:\n      type: {}\n      name: {}\n      config: {}\n      split: {}\n      revision: {}\n    metrics:"
+    ONE_METRIC = "    - type: {}\n      value: {}"
+    SKIP_KEYS = ["std", "evaluation_time", "main_score", "threshold"]
+
+    for ds_name, res_dict in sorted(all_results.items()):
+        mteb_desc = (
+            MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")]).tasks[0].description
+        )
+        hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name"))
+        if "CQADupstack" in ds_name:
+            hf_hub_name = "BeIR/cqadupstack"
+        mteb_type = mteb_desc["type"]
+        revision = res_dict.get("dataset_revision")  # Okay if it's None
+        split = "test"
+        if (ds_name in TRAIN_SPLIT) and ("train" in res_dict):
+            split = "train"
+        elif (ds_name in VALIDATION_SPLIT) and ("validation" in res_dict):
+            split = "validation"
+        elif (ds_name in DEV_SPLIT) and ("dev" in res_dict):
+            split = "dev"
+        elif "test" not in res_dict:
+            logger.info(f"Skipping {ds_name} as split {split} not present.")
+            continue
+        res_dict = res_dict.get(split)
+        for lang in mteb_desc["eval_langs"]:
+            mteb_name = f"MTEB {ds_name}"
+            mteb_name += f" ({lang})" if len(mteb_desc["eval_langs"]) > 1 else ""
+            # For English there is no language key if it's the only language
+            test_result_lang = res_dict.get(lang) if len(mteb_desc["eval_langs"]) > 1 else res_dict
+            # Skip if the language was not found but it has other languages
+            if test_result_lang is None:
+                continue
+            META_STRING += "\n" + ONE_TASK.format(
+                mteb_type, hf_hub_name, mteb_name, lang if len(mteb_desc["eval_langs"]) > 1 else "default", split, revision
+            )
+            for metric, score in test_result_lang.items():
+                if not isinstance(score, dict):
+                    score = {metric: score}
+                for sub_metric, sub_score in score.items():
+                    if any([x in sub_metric for x in SKIP_KEYS]):
+                        continue
+                    META_STRING += "\n" + ONE_METRIC.format(
+                        f"{metric}_{sub_metric}" if metric != sub_metric else metric,
+                        # All MTEB scores are 0-1; multiply them by 100 for 3 reasons:
+                        # 1) It's easier to visually digest (You need two chars less: "0.1" -> "1")
+                        # 2) Others may multiply them by 100 when building on MTEB, making it confusing what the range is.
+                        #    This happened with the Text and Code Embeddings paper (OpenAI) vs the original BEIR paper.
+                        # 3) It's accepted practice (SuperGLUE, GLUE are 0-100)
+                        sub_score * 100,
+                    )
+
+    META_STRING += "\n" + MARKER
+    if os.path.exists("./mteb_metadata.yaml"):
+        logger.warning("Overwriting mteb_metadata.yaml")
+    with open("./mteb_metadata.yaml", "w") as f:
+        f.write(META_STRING)
+
+
+def main():
+
+    if args.execute_eval:
+        output_folder = evaluate(args.model_id)
+        results_to_yaml(output_folder)
+        add_model()
+    else:
+        if args.output_folder is None:
+            raise ValueError("You must indicate where your results are located")
+        else:
+            results_to_yaml(args.output_folder)
+            add_model()
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="Select the model that you want to add to the Leaderboard.")
+    parser.add_argument("--model_id", type=str, required=True, help="HuggingFace model path that you want to evaluate.")
+    parser.add_argument("--execute_eval", type=bool, default=False, help="Select if you want to execute evaluation.")
+    parser.add_argument("--output_folder", type=str, help="Select the folder in which the results are stored.")
+    args = parser.parse_args()
+    main()
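Since `results_to_yaml` writes `mteb_metadata.yaml` as YAML front matter delimited by `---`, a quick sanity check before committing could look like the sketch below (not part of this commit; it only assumes the file exists in the current directory):

```python
# Sketch: parse the front matter produced by results_to_yaml and report what it contains.
import yaml

with open("mteb_metadata.yaml", "r", encoding="utf-8") as f:
    # The leading/trailing "---" markers make this a multi-document YAML stream;
    # keep the first non-empty document.
    docs = [d for d in yaml.safe_load_all(f) if d]

meta = docs[0]
model_entry = meta["model-index"][0]
print(model_entry["name"], "->", len(model_entry.get("results") or []), "task entries")
```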
add_new_model/execute_evaluation.py
ADDED
@@ -0,0 +1,39 @@
+import huggingface_hub
+import os
+import argparse
+import logging
+import torch
+from mteb import MTEB
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import login
+
+try:
+    huggingface_hub.login(
+        token=os.environ["HUGGINGFACE_TOKEN"], write_permission=True
+    )
+except Exception as e:
+    print(f"Original error: {e}")
+
+def get_device():
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+def load_model(model_id, device):
+    model = SentenceTransformer(model_id).to(device)
+    logging.info(f"Loaded model {model_id} to the device {device}")
+    return model
+
+def run_evaluation(model, model_id, output_folder):
+    evaluation = MTEB(task_langs=["es"])
+    evaluation.run(model, output_folder=output_folder, eval_splits=["test"])
+
+def evaluate(model_id):
+    logging.basicConfig(level=logging.INFO)
+    output_folder = f'results/{model_id}'
+    device = get_device()
+    model = load_model(model_id, device)
+    run_evaluation(model, model_id, output_folder)
+
+    return output_folder
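`evaluate` can also be called directly from Python, for example from a notebook. A minimal sketch (the model id is a placeholder, and a `HUGGINGFACE_TOKEN` environment variable is assumed for the login attempt that runs at import time):

```python
# Sketch: run the Spanish MTEB evaluation for one model and get the results folder back.
from execute_evaluation import evaluate

results_folder = evaluate("your-user/your-spanish-embedding-model")  # placeholder id
print(f"JSON results written under: {results_folder}")               # results/<model_id>
```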
app.py
CHANGED
@@ -34,6 +34,17 @@ def clustering_dataframe_update():
 def retrieval_dataframe_update():
     pass
 
+def make_clickable_model(link):
+    """
+    Load json from models. This update has to be done before passing the df to Gradio.
+    """
+    model_display_name = link.split("/")[-1]
+    # Remove user from model name
+    return (
+        f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_display_name}</a>'
+    )
+
+
 block = gr.Blocks()
 with block:
     gr.Markdown(f"""**Leaderboard de modelos de Embeddings en español
|