# Web-page header captured together with this file (not part of the configuration):
# ethanLeaderboard / config.yaml
# Muennighoff's picture
# Fix metric names & metadata new format
# 6181979
# raw
# history blame
# 18.8 kB
# Leaderboard-wide settings. The three keys below must be nested under
# `config:`; at column 0 they would become unrelated top-level keys and
# `config` itself would parse as null.
config:
  REPO_ID: "mteb/leaderboard"
  RESULTS_REPO: "mteb/results"
  LEADERBOARD_NAME: "MTEB Leaderboard"
# Per-task-type metadata. Every entry carries the same four keys:
#   icon               - emoji shown for the task type
#   metric             - key of the main score for the task type
#   metric_description - human-readable (partly Markdown) name of the metric
#   task_description   - one-sentence explanation of the task type
tasks:
  BitextMining:
    icon: "🎌"
    metric: f1
    metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
    task_description: "Bitext mining is the task of finding parallel sentences in two languages."
  Classification:
    icon: "❤️"
    metric: accuracy
    metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
    task_description: "Classification is the task of assigning a label to a text."
  Clustering:
    icon: "✨"
    metric: v_measure
    metric_description: "Validity Measure (V-measure)"
    task_description: "Clustering is the task of grouping similar documents together."
  PairClassification:
    icon: "🎭"
    metric: ap
    metric_description: "Average Precision (AP) based on the model's similarity metric (usually cosine)"
    task_description: "Pair classification is the task of determining whether two texts are similar."
  Reranking:
    icon: "🥈"
    metric: map
    metric_description: "Mean Average Precision (MAP)"
    task_description: "Reranking is the task of reordering a list of documents to improve relevance."
  Retrieval:
    icon: "🔎"
    metric: ndcg_at_10
    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
    task_description: "Retrieval is the task of finding relevant documents for a query."
  STS:
    icon: "☘️"
    metric: spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
  Summarization:
    icon: "📜"
    metric: spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Summarization is the task of generating a summary of a text."
  InstructionRetrieval:
    icon: "🔎📋"
    # Quoted on purpose: the metric key contains a hyphen and mixed case.
    metric: "p-MRR"
    metric_description: "paired mean reciprocal rank (p-MRR)"
    task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
# Board definitions, keyed by a short board id. Presumably one leaderboard tab
# per entry (several `language_long` values refer to "the other tabs").
# Every board carries the same keys:
#   title         - short display name
#   language_long - description of the covered language(s)
#   has_overall   - boolean flag (meaning not visible here; presumably toggles an
#                   overall view - confirm against the consumer)
#   acronym       - benchmark acronym, or null
#   icon          - emoji for the board, or null
#   special_icons - per-task-type icon overrides, or null
#   credits       - Markdown attribution, or null
#   tasks         - mapping of task type -> list of dataset names
boards:
  en:
    title: English
    language_long: "English"
    has_overall: true
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (en)
        - AmazonPolarityClassification
        - AmazonReviewsClassification (en)
        - Banking77Classification
        - EmotionClassification
        - ImdbClassification
        - MassiveIntentClassification (en)
        - MassiveScenarioClassification (en)
        - MTOPDomainClassification (en)
        - MTOPIntentClassification (en)
        - ToxicConversationsClassification
        - TweetSentimentExtractionClassification
      Clustering:
        - ArxivClusteringP2P
        - ArxivClusteringS2S
        - BiorxivClusteringP2P
        - BiorxivClusteringS2S
        - MedrxivClusteringP2P
        - MedrxivClusteringS2S
        - RedditClustering
        - RedditClusteringP2P
        - StackExchangeClustering
        - StackExchangeClusteringP2P
        - TwentyNewsgroupsClustering
      PairClassification:
        - SprintDuplicateQuestions
        - TwitterSemEval2015
        - TwitterURLCorpus
      Reranking:
        - AskUbuntuDupQuestions
        - MindSmallReranking
        - SciDocsRR
        - StackOverflowDupQuestions
      Retrieval:
        - ArguAna
        - ClimateFEVER
        - CQADupstackRetrieval
        - DBPedia
        - FEVER
        - FiQA2018
        - HotpotQA
        - MSMARCO
        - NFCorpus
        - NQ
        - QuoraRetrieval
        - SCIDOCS
        - SciFact
        - Touche2020
        - TRECCOVID
      STS:
        - BIOSSES
        - SICK-R
        - STS12
        - STS13
        - STS14
        - STS15
        - STS16
        - STS17 (en-en)
        - STS22 (en)
        - STSBenchmark
      Summarization:
        - SummEval
  en-x:
    title: "English-X"
    language_long: "117 (Pairs of: English & other language)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      # Block style, one dataset per line: the former single-line flow list was
      # broken in the middle of the 'Tatoeba (swg-eng)' scalar.
      BitextMining:
        - BUCC (de-en)
        - BUCC (fr-en)
        - BUCC (ru-en)
        - BUCC (zh-en)
        - Tatoeba (afr-eng)
        - Tatoeba (amh-eng)
        - Tatoeba (ang-eng)
        - Tatoeba (ara-eng)
        - Tatoeba (arq-eng)
        - Tatoeba (arz-eng)
        - Tatoeba (ast-eng)
        - Tatoeba (awa-eng)
        - Tatoeba (aze-eng)
        - Tatoeba (bel-eng)
        - Tatoeba (ben-eng)
        - Tatoeba (ber-eng)
        - Tatoeba (bos-eng)
        - Tatoeba (bre-eng)
        - Tatoeba (bul-eng)
        - Tatoeba (cat-eng)
        - Tatoeba (cbk-eng)
        - Tatoeba (ceb-eng)
        - Tatoeba (ces-eng)
        - Tatoeba (cha-eng)
        - Tatoeba (cmn-eng)
        - Tatoeba (cor-eng)
        - Tatoeba (csb-eng)
        - Tatoeba (cym-eng)
        - Tatoeba (dan-eng)
        - Tatoeba (deu-eng)
        - Tatoeba (dsb-eng)
        - Tatoeba (dtp-eng)
        - Tatoeba (ell-eng)
        - Tatoeba (epo-eng)
        - Tatoeba (est-eng)
        - Tatoeba (eus-eng)
        - Tatoeba (fao-eng)
        - Tatoeba (fin-eng)
        - Tatoeba (fra-eng)
        - Tatoeba (fry-eng)
        - Tatoeba (gla-eng)
        - Tatoeba (gle-eng)
        - Tatoeba (glg-eng)
        - Tatoeba (gsw-eng)
        - Tatoeba (heb-eng)
        - Tatoeba (hin-eng)
        - Tatoeba (hrv-eng)
        - Tatoeba (hsb-eng)
        - Tatoeba (hun-eng)
        - Tatoeba (hye-eng)
        - Tatoeba (ido-eng)
        - Tatoeba (ile-eng)
        - Tatoeba (ina-eng)
        - Tatoeba (ind-eng)
        - Tatoeba (isl-eng)
        - Tatoeba (ita-eng)
        - Tatoeba (jav-eng)
        - Tatoeba (jpn-eng)
        - Tatoeba (kab-eng)
        - Tatoeba (kat-eng)
        - Tatoeba (kaz-eng)
        - Tatoeba (khm-eng)
        - Tatoeba (kor-eng)
        - Tatoeba (kur-eng)
        - Tatoeba (kzj-eng)
        - Tatoeba (lat-eng)
        - Tatoeba (lfn-eng)
        - Tatoeba (lit-eng)
        - Tatoeba (lvs-eng)
        - Tatoeba (mal-eng)
        - Tatoeba (mar-eng)
        - Tatoeba (max-eng)
        - Tatoeba (mhr-eng)
        - Tatoeba (mkd-eng)
        - Tatoeba (mon-eng)
        - Tatoeba (nds-eng)
        - Tatoeba (nld-eng)
        - Tatoeba (nno-eng)
        - Tatoeba (nob-eng)
        - Tatoeba (nov-eng)
        - Tatoeba (oci-eng)
        - Tatoeba (orv-eng)
        - Tatoeba (pam-eng)
        - Tatoeba (pes-eng)
        - Tatoeba (pms-eng)
        - Tatoeba (pol-eng)
        - Tatoeba (por-eng)
        - Tatoeba (ron-eng)
        - Tatoeba (rus-eng)
        - Tatoeba (slk-eng)
        - Tatoeba (slv-eng)
        - Tatoeba (spa-eng)
        - Tatoeba (sqi-eng)
        - Tatoeba (srp-eng)
        - Tatoeba (swe-eng)
        - Tatoeba (swg-eng)
        - Tatoeba (swh-eng)
        - Tatoeba (tam-eng)
        - Tatoeba (tat-eng)
        - Tatoeba (tel-eng)
        - Tatoeba (tgl-eng)
        - Tatoeba (tha-eng)
        - Tatoeba (tuk-eng)
        - Tatoeba (tur-eng)
        - Tatoeba (tzl-eng)
        - Tatoeba (uig-eng)
        - Tatoeba (ukr-eng)
        - Tatoeba (urd-eng)
        - Tatoeba (uzb-eng)
        - Tatoeba (vie-eng)
        - Tatoeba (war-eng)
        - Tatoeba (wuu-eng)
        - Tatoeba (xho-eng)
        - Tatoeba (yid-eng)
        - Tatoeba (yue-eng)
        - Tatoeba (zsm-eng)
  zh:
    title: Chinese
    language_long: Chinese
    has_overall: true
    acronym: C-MTEB
    icon: "🇨🇳"
    special_icons:
      Classification: "🧡"
    credits: "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)"
    tasks:
      Classification:
        - AmazonReviewsClassification (zh)
        - IFlyTek
        - JDReview
        - MassiveIntentClassification (zh-CN)
        - MassiveScenarioClassification (zh-CN)
        - MultilingualSentiment
        - OnlineShopping
        - TNews
        - Waimai
      Clustering:
        - CLSClusteringP2P
        - CLSClusteringS2S
        - ThuNewsClusteringP2P
        - ThuNewsClusteringS2S
      PairClassification:
        - Cmnli
        - Ocnli
      Reranking:
        - CMedQAv1
        - CMedQAv2
        - MMarcoReranking
        - T2Reranking
      Retrieval:
        - CmedqaRetrieval
        - CovidRetrieval
        - DuRetrieval
        - EcomRetrieval
        - MedicalRetrieval
        - MMarcoRetrieval
        - T2Retrieval
        - VideoRetrieval
      STS:
        - AFQMC
        - ATEC
        - BQ
        - LCQMC
        - PAWSX
        - QBQTC
        - STS22 (zh)
        - STSB
  da:
    title: Danish
    language_long: Danish
    has_overall: false
    acronym: null
    icon: "🇩🇰"
    special_icons:
      Classification: "🤍"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      BitextMining:
        - BornholmBitextMining
      Classification:
        - AngryTweetsClassification
        - DanishPoliticalCommentsClassification
        - DKHateClassification
        - LccSentimentClassification
        - MassiveIntentClassification (da)
        - MassiveScenarioClassification (da)
        - NordicLangClassification
        - ScalaDaClassification
  fr:
    title: French
    language_long: "French"
    has_overall: true
    acronym: "F-MTEB"
    icon: "🇫🇷"
    special_icons:
      Classification: "💙"
    credits: "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)"
    tasks:
      Classification:
        - AmazonReviewsClassification (fr)
        - MasakhaNEWSClassification (fra)
        - MassiveIntentClassification (fr)
        - MassiveScenarioClassification (fr)
        - MTOPDomainClassification (fr)
        - MTOPIntentClassification (fr)
      Clustering:
        - AlloProfClusteringP2P
        - AlloProfClusteringS2S
        - HALClusteringS2S
        - MLSUMClusteringP2P (fr)
        - MLSUMClusteringS2S (fr)
        - MasakhaNEWSClusteringP2P (fra)
        - MasakhaNEWSClusteringS2S (fra)
      PairClassification:
        - OpusparcusPC (fr)
        - PawsX (fr)
      Reranking:
        - AlloprofReranking
        - SyntecReranking
      Retrieval:
        - AlloprofRetrieval
        - BSARDRetrieval
        - MintakaRetrieval (fr)
        - SyntecRetrieval
        - XPQARetrieval (fr)
      STS:
        - STS22 (fr)
        - STSBenchmarkMultilingualSTS (fr)
        - SICKFr
      Summarization:
        - SummEvalFr
  # The key is quoted because YAML 1.1 parsers read a bare `no` as boolean false.
  'no':
    title: Norwegian
    language_long: "Norwegian Bokmål"
    has_overall: false
    acronym: null
    icon: "🇳🇴"
    special_icons:
      Classification: "💙"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      # NOTE(review): `&id001` is a generated-looking anchor. No alias to it is
      # visible in this part of the file; it is kept in case one exists elsewhere.
      Classification: &id001
        - NoRecClassification
        - NordicLangClassification
        - NorwegianParliament
        - MassiveIntentClassification (nb)
        - MassiveScenarioClassification (nb)
        - ScalaNbClassification
  instructions:
    title: English
    language_long: "English"
    has_overall: false
    acronym: null
    icon: null
    # Added so this board exposes the same key set as every other board.
    special_icons: null
    credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)"
    tasks:
      InstructionRetrieval:
        - Robust04InstructionRetrieval
        - News21InstructionRetrieval
        - Core17InstructionRetrieval
  law:
    title: Law
    language_long: "English, German, Chinese"
    has_overall: false
    acronym: null
    icon: "⚖️"
    special_icons: null
    credits: "[Voyage AI](https://www.voyageai.com/)"
    tasks:
      Retrieval:
        - AILACasedocs
        - AILAStatutes
        - GerDaLIRSmall
        - LeCaRDv2
        - LegalBenchConsumerContractsQA
        - LegalBenchCorporateLobbying
        - LegalQuAD
        - LegalSummarization
  longembed:
    title: LongEmbed
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "📚"
    special_icons: null
    credits: "[LongEmbed](https://arxiv.org/abs/2404.12096v2)"
    # Board-level metric text; only this board defines it. Quoted because the
    # value contains `@`, `&` and parentheses.
    metric: "nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)"
    tasks:
      Retrieval:
        - LEMBNarrativeQARetrieval
        - LEMBNeedleRetrieval
        - LEMBPasskeyRetrieval
        - LEMBQMSumRetrieval
        - LEMBSummScreenFDRetrieval
        - LEMBWikimQARetrieval
  de:
    title: German
    language_long: "German"
    has_overall: false
    acronym: null
    icon: "🇩🇪"
    special_icons: null
    credits: "[Silvan](https://github.com/slvnwhrl)"
    tasks:
      Clustering:
        - BlurbsClusteringP2P
        - BlurbsClusteringS2S
        - TenKGnadClusteringP2P
        - TenKGnadClusteringS2S
  pl:
    title: Polish
    language_long: Polish
    has_overall: true
    acronym: null
    icon: "🇵🇱"
    special_icons:
      Classification: "🤍"
    credits: "[Rafał Poświata](https://github.com/rafalposwiata)"
    tasks:
      Classification:
        - AllegroReviews
        - CBD
        - MassiveIntentClassification (pl)
        - MassiveScenarioClassification (pl)
        - PAC
        - PolEmo2.0-IN
        - PolEmo2.0-OUT
      Clustering:
        - 8TagsClustering
      PairClassification:
        - CDSC-E
        - PPC
        - PSC
        - SICK-E-PL
      Retrieval:
        - ArguAna-PL
        - DBPedia-PL
        - FiQA-PL
        - HotpotQA-PL
        - MSMARCO-PL
        - NFCorpus-PL
        - NQ-PL
        - Quora-PL
        - SCIDOCS-PL
        - SciFact-PL
        - TRECCOVID-PL
      STS:
        - CDSC-R
        - SICK-R-PL
        - STS22 (pl)
  se:
    title: Swedish
    language_long: Swedish
    has_overall: false
    acronym: null
    icon: "🇸🇪"
    special_icons:
      Classification: "💛"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      # Swedish datasets. This list used to be a copy of the Norwegian board's
      # list (NoRec, NorwegianParliament, "(nb)" subsets, ScalaNb).
      Classification:
        - DalajClassification
        - MassiveIntentClassification (sv)
        - MassiveScenarioClassification (sv)
        - NordicLangClassification
        - ScalaSvClassification
        - SweRecClassification
  other-cls:
    title: "Other Languages"
    language_long: "47 (Only languages not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons:
      Classification: "💜💚💙"
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (de)
        - AmazonCounterfactualClassification (ja)
        - AmazonReviewsClassification (de)
        - AmazonReviewsClassification (es)
        - AmazonReviewsClassification (fr)
        - AmazonReviewsClassification (ja)
        - AmazonReviewsClassification (zh)
        - MTOPDomainClassification (de)
        - MTOPDomainClassification (es)
        - MTOPDomainClassification (fr)
        - MTOPDomainClassification (hi)
        - MTOPDomainClassification (th)
        - MTOPIntentClassification (de)
        - MTOPIntentClassification (es)
        - MTOPIntentClassification (fr)
        - MTOPIntentClassification (hi)
        - MTOPIntentClassification (th)
        - MassiveIntentClassification (af)
        - MassiveIntentClassification (am)
        - MassiveIntentClassification (ar)
        - MassiveIntentClassification (az)
        - MassiveIntentClassification (bn)
        - MassiveIntentClassification (cy)
        - MassiveIntentClassification (de)
        - MassiveIntentClassification (el)
        - MassiveIntentClassification (es)
        - MassiveIntentClassification (fa)
        - MassiveIntentClassification (fi)
        - MassiveIntentClassification (fr)
        - MassiveIntentClassification (he)
        - MassiveIntentClassification (hi)
        - MassiveIntentClassification (hu)
        - MassiveIntentClassification (hy)
        - MassiveIntentClassification (id)
        - MassiveIntentClassification (is)
        - MassiveIntentClassification (it)
        - MassiveIntentClassification (ja)
        - MassiveIntentClassification (jv)
        - MassiveIntentClassification (ka)
        - MassiveIntentClassification (km)
        - MassiveIntentClassification (kn)
        - MassiveIntentClassification (ko)
        - MassiveIntentClassification (lv)
        - MassiveIntentClassification (ml)
        - MassiveIntentClassification (mn)
        - MassiveIntentClassification (ms)
        - MassiveIntentClassification (my)
        - MassiveIntentClassification (nl)
        - MassiveIntentClassification (pt)
        - MassiveIntentClassification (ro)
        - MassiveIntentClassification (ru)
        - MassiveIntentClassification (sl)
        - MassiveIntentClassification (sq)
        - MassiveIntentClassification (sw)
        - MassiveIntentClassification (ta)
        - MassiveIntentClassification (te)
        - MassiveIntentClassification (th)
        - MassiveIntentClassification (tl)
        - MassiveIntentClassification (tr)
        - MassiveIntentClassification (ur)
        - MassiveIntentClassification (vi)
        - MassiveIntentClassification (zh-TW)
        - MassiveScenarioClassification (af)
        - MassiveScenarioClassification (am)
        - MassiveScenarioClassification (ar)
        - MassiveScenarioClassification (az)
        - MassiveScenarioClassification (bn)
        - MassiveScenarioClassification (cy)
        - MassiveScenarioClassification (de)
        - MassiveScenarioClassification (el)
        - MassiveScenarioClassification (es)
        - MassiveScenarioClassification (fa)
        - MassiveScenarioClassification (fi)
        - MassiveScenarioClassification (fr)
        - MassiveScenarioClassification (he)
        - MassiveScenarioClassification (hi)
        - MassiveScenarioClassification (hu)
        - MassiveScenarioClassification (hy)
        - MassiveScenarioClassification (id)
        - MassiveScenarioClassification (is)
        - MassiveScenarioClassification (it)
        - MassiveScenarioClassification (ja)
        - MassiveScenarioClassification (jv)
        - MassiveScenarioClassification (ka)
        - MassiveScenarioClassification (km)
        - MassiveScenarioClassification (kn)
        - MassiveScenarioClassification (ko)
        - MassiveScenarioClassification (lv)
        - MassiveScenarioClassification (ml)
        - MassiveScenarioClassification (mn)
        - MassiveScenarioClassification (ms)
        - MassiveScenarioClassification (my)
        - MassiveScenarioClassification (nl)
        - MassiveScenarioClassification (pt)
        - MassiveScenarioClassification (ro)
        - MassiveScenarioClassification (ru)
        - MassiveScenarioClassification (sl)
        - MassiveScenarioClassification (sq)
        - MassiveScenarioClassification (sw)
        - MassiveScenarioClassification (ta)
        - MassiveScenarioClassification (te)
        - MassiveScenarioClassification (th)
        - MassiveScenarioClassification (tl)
        - MassiveScenarioClassification (tr)
        - MassiveScenarioClassification (ur)
        - MassiveScenarioClassification (vi)
        - MassiveScenarioClassification (zh-TW)
  other-sts:
    title: Other
    language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      # NOTE(review): "STS22 (fr)", "STS22 (pl)" and "STSBenchmark" also appear
      # on the fr, pl and en boards, which seems to contradict `language_long`
      # above. Left unchanged - confirm whether the overlap is intended.
      STS:
        - STS17 (ar-ar)
        - STS17 (en-ar)
        - STS17 (en-de)
        - STS17 (en-tr)
        - STS17 (es-en)
        - STS17 (es-es)
        - STS17 (fr-en)
        - STS17 (it-en)
        - STS17 (ko-ko)
        - STS17 (nl-en)
        - STS22 (ar)
        - STS22 (de)
        - STS22 (de-en)
        - STS22 (de-fr)
        - STS22 (de-pl)
        - STS22 (es)
        - STS22 (es-en)
        - STS22 (es-it)
        - STS22 (fr)
        - STS22 (fr-pl)
        - STS22 (it)
        - STS22 (pl)
        - STS22 (pl-en)
        - STS22 (ru)
        - STS22 (tr)
        - STS22 (zh-en)
        - STSBenchmark