"""
Usage: python mteb_meta.py path_to_results_folder
Creates evaluation results metadata for the model card.
E.g.
---
tags:
- mteb
model-index:
- name: SGPT-5.8B-weightedmean-msmarco-specb-bitfit
results:
- task:
type: classification
dataset:
type: mteb/banking77
name: MTEB Banking77
config: default
split: test
revision: 44fa15921b4c889113cc5df03dd4901b49161ab7
metrics:
- type: accuracy
value: 84.49350649350649
---
"""
import json
import logging
import os
import sys

from mteb import MTEB

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

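# Expected layout: <path_to_results_folder>/<TaskName>.json, one file per evaluated
# task; the folder's last path component is used as the model name in the metadata.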
results_folder = sys.argv[1].strip("/")
model_name = results_folder.split("/")[-1]
all_results = {}
for file_name in os.listdir(results_folder):
    if not file_name.endswith(".json"):
        logger.info(f"Skipping non-json {file_name}")
        continue
    with open(os.path.join(results_folder, file_name), "r", encoding="utf-8") as f:
        results = json.load(f)
    all_results = {**all_results, **{file_name.replace(".json", ""): results}}
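# Static pieces of the YAML front matter; ONE_TASK and ONE_METRIC follow the
# nesting shown in the docstring example above.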
MARKER = "---"
TAGS = "tags:"
MTEB_TAG = "- mteb"
HEADER = "model-index:"
MODEL = f"- name: {model_name}"
RES = " results:"
META_STRING = "\n".join([MARKER, TAGS, MTEB_TAG, HEADER, MODEL, RES])
ONE_TASK = " - task:\n type: {}\n dataset:\n type: {}\n name: {}\n config: {}\n split: {}\n revision: {}\n metrics:"
ONE_METRIC = " - type: {}\n value: {}"
SKIP_KEYS = ["std", "evaluation_time", "main_score", "threshold"]
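# One task entry per dataset; multilingual datasets additionally get one entry per language.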
for ds_name, res_dict in sorted(all_results.items()):
    mteb_desc = (
        MTEB(tasks=[ds_name.replace("CQADupstackRetrieval", "CQADupstackAndroidRetrieval")])
        .tasks[0]
        .description
    )
    hf_hub_name = mteb_desc.get("hf_hub_name", mteb_desc.get("beir_name"))
    if "CQADupstack" in ds_name:
        hf_hub_name = "BeIR/cqadupstack"
    mteb_type = mteb_desc["type"]
    revision = res_dict.get("dataset_revision")  # Okay if it's None
    split = "test"
    if ds_name == "MSMARCO":
        split = "dev" if "dev" in res_dict else "validation"
    if split not in res_dict:
        logger.info(f"Skipping {ds_name} as split {split} not present.")
        continue
    res_dict = res_dict.get(split)
    for lang in mteb_desc["eval_langs"]:
        mteb_name = f"MTEB {ds_name}"
        mteb_name += f" ({lang})" if len(mteb_desc["eval_langs"]) > 1 else ""
        # For English there is no language key if it's the only language
        test_result_lang = res_dict.get(lang) if len(mteb_desc["eval_langs"]) > 1 else res_dict
        # Skip if this language is missing from the results (the file may only cover other languages)
        if test_result_lang is None:
            continue
        META_STRING += "\n" + ONE_TASK.format(
            mteb_type,
            hf_hub_name,
            mteb_name,
            lang if len(mteb_desc["eval_langs"]) > 1 else "default",
            split,
            revision,
        )
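        # Nested score dicts are flattened to "<metric>_<sub_metric>" entries below;
        # flat scores keep their metric name as-is.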
        for metric, score in test_result_lang.items():
            if not isinstance(score, dict):
                score = {metric: score}
            for sub_metric, sub_score in score.items():
                if any(x in sub_metric for x in SKIP_KEYS):
                    continue
                META_STRING += "\n" + ONE_METRIC.format(
                    f"{metric}_{sub_metric}" if metric != sub_metric else metric,
                    # All MTEB scores are 0-1; multiply by 100 for three reasons:
                    # 1) It's easier to visually digest (no leading "0." on every score)
                    # 2) Others building on MTEB may multiply by 100 anyway, making the range ambiguous;
                    #    this happened with the Text and Code Embeddings paper (OpenAI) vs. the original BEIR paper
                    # 3) It's accepted practice (GLUE and SuperGLUE scores are 0-100)
                    sub_score * 100,
                )
META_STRING += "\n" + MARKER
if os.path.exists("./mteb_metadata.md"):
    logger.warning("Overwriting mteb_metadata.md")

with open("./mteb_metadata.md", "w") as f:
    f.write(META_STRING)
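# The generated mteb_metadata.md contains only the YAML front matter block; it is
# typically pasted at the top of the model's README.md on the Hugging Face Hub so
# the scores appear on the model card.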