Muennighoff committed on
Commit
767d579
1 Parent(s): f8ed0b8

Multiple LEMB metrics & fix legacy french naming

Files changed (4)
  1. EXTERNAL_MODEL_RESULTS.json +0 -0
  2. app.py +38 -17
  3. config.yaml +2 -2
  4. model_meta.yaml +8 -0
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,4 +1,4 @@
-from functools import partial, reduce
+from functools import reduce
 import json
 import os
 import re
@@ -23,7 +23,7 @@ PRETTY_NAMES = {
     "BitextMining": "Bitext Mining",
 }
 
-TASK_TO_METRIC = {k:v["metric"] for k,v in TASKS_CONFIG.items()}
+TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
 
 def make_clickable_model(model_name, link=None):
     if link is None:
@@ -93,6 +93,17 @@ def add_task(examples):
         examples["mteb_task"] = "Unknown"
     return examples
 
+def filter_metric_external(x, task, metric):
+    # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
+    if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
+        return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
+    else:
+        return x["mteb_task"] == task and x["metric"] == metric
+
+def filter_metric_fetched(name, metric, expected_metric):
+    # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
+    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric == expected_metric
+
 if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
     with open("EXTERNAL_MODEL_RESULTS.json") as f:
         EXTERNAL_MODEL_RESULTS = json.load(f)
@@ -115,17 +126,9 @@ for model in pbar:
     ds = ds.map(add_lang)
     ds = ds.map(add_task)
     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
-    # For now only one metric per task - Could add more metrics lateron
-
-    def filter_function(x, task, metric):
-        # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
-        if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
-            return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
-        else:
-            return x["mteb_task"] == task and x["metric"] == metric
-
+
     for task, metric in TASK_TO_METRIC.items():
-        ds_dict = ds.filter(lambda x: filter_function(x, task, metric))["test"].to_dict()
+        ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metric))["test"].to_dict()
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
@@ -190,6 +193,11 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     global MODEL_INFOS
     api = API
     models = api.list_models(filter="mteb")
+    # Legacy names changes; Also fetch the old results & merge later
+    if ('MLSUMClusteringP2P (fr)' in datasets):
+        datasets.append('MLSUMClusteringP2P')
+    if ('MLSUMClusteringS2S (fr)' in datasets):
+        datasets.append('MLSUMClusteringS2S')
     # Initialize list to models that we cannot fetch metadata from
     df_list = []
     for model in EXTERNAL_MODEL_RESULTS:
@@ -253,7 +261,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # if model.modelId == "w601sxs/b1ade-embed-kd_3":
         #     import pdb; pdb.set_trace()
         try:
-            out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
+            out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
         except:
             print("ERROR", model.modelId)
             continue
@@ -281,7 +289,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         df_list.append(out)
         if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
             SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
-
+
     # Save & cache MODEL_INFOS
     with open("model_infos.json", "w") as f:
         json.dump(MODEL_INFOS, f)
@@ -294,7 +302,18 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     cols = sorted(list(df.columns))
     base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
     if len(datasets) > 0:
-        #filter invalid columns
+        # Update legacy column names to be merged with newer ones
+        # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
+        #if ('MLSUMClusteringP2P (fr)' in datasets):
+        #    import pdb; pdb.set_trace()
+        if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
+            #import pdb; pdb.set_trace()
+            df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
+            datasets.remove('MLSUMClusteringP2P')
+        if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
+            df['MLSUMClusteringS2S (fr)'] = df['MLSUMClusteringS2S (fr)'].fillna(df['MLSUMClusteringS2S'])
+            datasets.remove('MLSUMClusteringS2S')
+        # Filter invalid columns
         cols = [col for col in cols if col in base_columns + datasets]
     i = 0
     for column in base_columns:
@@ -447,6 +466,7 @@ for board, board_config in BOARDS_CONFIG.items():
     if board_icon is None:
         board_icon = ""
     credits = board_config.get("credits", None)
+    metric = board_config.get("metric", None)
 
     if board_config["has_overall"]:
         overall_pretty_name = board_pretty_name
@@ -459,6 +479,7 @@ for board, board_config in BOARDS_CONFIG.items():
            "data": boards_data[board]["data_overall"],
            "refresh": get_refresh_overall_function(board_config["tasks"]),
            "credits": credits,
+           "metric": metric,
        })
    for task_category, task_category_list in board_config["tasks"].items():
        task_icon = TASKS_CONFIG[task_category]['icon']
@@ -471,7 +492,7 @@ for board, board_config in BOARDS_CONFIG.items():
            "data": boards_data[board]["data_tasks"][task_category],
            "refresh": get_refresh_function(task_category, task_category_list),
            "credits": credits,
-           "metric": board_config.get("metric", None),
+           "metric": metric,
        })
 
 dataframes = []
@@ -635,7 +656,7 @@ with gr.Blocks(css=css) as block:
                gr.Markdown(f"""
                {item['description']}
 
-                - **Metric:** {specific_metric}
+                - **Metric:** {item.get('metric', metric)}
                - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                """)

config.yaml CHANGED
@@ -224,8 +224,8 @@ boards:
        - AlloProfClusteringP2P
        - AlloProfClusteringS2S
        - HALClusteringS2S
-       - MLSUMClusteringP2P
-       - MLSUMClusteringS2S
+       - MLSUMClusteringP2P (fr)
+       - MLSUMClusteringS2S (fr)
        - MasakhaNEWSClusteringP2P (fra)
        - MasakhaNEWSClusteringS2S (fra)
      PairClassification:
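The board config now lists the French MLSUM splits under their "(fr)" names, while results uploaded before the rename still report scores under the legacy "MLSUMClusteringP2P" / "MLSUMClusteringS2S" columns; the app.py change above backfills the new columns from the legacy ones before dropping them. A small self-contained sketch of that merge with made-up scores (only the column names and the fillna/remove pattern come from the diff):

import pandas as pd

# Made-up rows: one model was only scored under the legacy name, the other under the new one
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "MLSUMClusteringP2P": [45.1, None],
    "MLSUMClusteringP2P (fr)": [None, 46.3],
})
datasets = ["MLSUMClusteringP2P (fr)", "MLSUMClusteringP2P"]
cols = list(df.columns)

# Same pattern as get_mteb_data: backfill the new column, then drop the legacy name
if ("MLSUMClusteringP2P (fr)" in datasets) and ("MLSUMClusteringP2P" in cols):
    df["MLSUMClusteringP2P (fr)"] = df["MLSUMClusteringP2P (fr)"].fillna(df["MLSUMClusteringP2P"])
    datasets.remove("MLSUMClusteringP2P")

cols = [col for col in cols if col in ["Model"] + datasets]
print(df[cols])
#      Model  MLSUMClusteringP2P (fr)
# 0  model-a                     45.1
# 1  model-b                     46.3
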
model_meta.yaml CHANGED
@@ -1195,6 +1195,14 @@ model_meta:
    is_external: true
    is_proprietary: true
    is_sentence_transformers_compatible: false
+  voyage-multilingual-2:
+    link: https://docs.voyageai.com/embeddings/
+    seq_len: 32000
+    size: null
+    dim: 1024
+    is_external: true
+    is_proprietary: true
+    is_sentence_transformers_compatible: false
  xlm-roberta-base:
    link: https://huggingface.co/xlm-roberta-base
    seq_len: 514
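The model_meta.yaml addition is plain YAML. A hedged sketch of loading the new voyage-multilingual-2 entry (assumes PyYAML; how the leaderboard maps these fields onto columns such as Max Tokens or Embedding Dimensions is outside this diff):

import yaml

entry = yaml.safe_load("""
voyage-multilingual-2:
  link: https://docs.voyageai.com/embeddings/
  seq_len: 32000
  size: null
  dim: 1024
  is_external: true
  is_proprietary: true
  is_sentence_transformers_compatible: false
""")["voyage-multilingual-2"]

# seq_len and dim are the context length and embedding dimension recorded for the model
print(entry["seq_len"], entry["dim"], entry["is_proprietary"])  # 32000 1024 True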