CrispStrobe committed on
Commit
062b038
·
1 Parent(s): b090cc8

feat: enhance MTEB enrichment with cross-revision aggregation and manual fallbacks for latest models

Browse files
data/benchmarks.json CHANGED
@@ -72946,16 +72946,28 @@
72946
  {
72947
  "hf_id": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
72948
  "name": "paraphrase-multilingual-mpnet-base-v2",
72949
- "mteb_avg": 31.36,
 
72950
  "sources": {
72951
- "mteb_avg": "mteb"
 
 
 
 
 
 
 
 
 
 
 
72952
  }
72953
  },
72954
  {
72955
  "hf_id": "BAAI/bge-m3",
72956
  "name": "bge-m3",
72957
- "mteb_avg": 63.28,
72958
- "mteb_retrieval": 63.28,
72959
  "sources": {
72960
  "mteb_avg": "mteb",
72961
  "mteb_retrieval": "mteb"
@@ -72964,8 +72976,8 @@
72964
  {
72965
  "hf_id": "sentence-transformers/all-MiniLM-L12-v2",
72966
  "name": "all-MiniLM-L12-v2",
72967
- "mteb_avg": 37.82,
72968
- "mteb_retrieval": 50.99,
72969
  "sources": {
72970
  "mteb_avg": "mteb",
72971
  "mteb_retrieval": "mteb"
@@ -72974,11 +72986,38 @@
72974
  {
72975
  "hf_id": "intfloat/e5-mistral-7b-instruct",
72976
  "name": "e5-mistral-7b-instruct",
72977
- "mteb_avg": 70.57,
72978
- "mteb_retrieval": 54.76,
72979
  "sources": {
72980
  "mteb_avg": "mteb",
72981
  "mteb_retrieval": "mteb"
72982
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72983
  }
72984
  ]
 
72946
  {
72947
  "hf_id": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
72948
  "name": "paraphrase-multilingual-mpnet-base-v2",
72949
+ "mteb_avg": 146.35,
72950
+ "mteb_retrieval": 35.31,
72951
  "sources": {
72952
+ "mteb_avg": "mteb",
72953
+ "mteb_retrieval": "mteb"
72954
+ }
72955
+ },
72956
+ {
72957
+ "hf_id": "BAAI/bge-large-en-v1.5",
72958
+ "name": "bge-large-en-v1.5",
72959
+ "mteb_avg": 46.8,
72960
+ "mteb_retrieval": 38.8,
72961
+ "sources": {
72962
+ "mteb_avg": "mteb",
72963
+ "mteb_retrieval": "mteb"
72964
  }
72965
  },
72966
  {
72967
  "hf_id": "BAAI/bge-m3",
72968
  "name": "bge-m3",
72969
+ "mteb_avg": 69.8,
72970
+ "mteb_retrieval": 47.34,
72971
  "sources": {
72972
  "mteb_avg": "mteb",
72973
  "mteb_retrieval": "mteb"
 
72976
  {
72977
  "hf_id": "sentence-transformers/all-MiniLM-L12-v2",
72978
  "name": "all-MiniLM-L12-v2",
72979
+ "mteb_avg": 47.65,
72980
+ "mteb_retrieval": 29.72,
72981
  "sources": {
72982
  "mteb_avg": "mteb",
72983
  "mteb_retrieval": "mteb"
 
72986
  {
72987
  "hf_id": "intfloat/e5-mistral-7b-instruct",
72988
  "name": "e5-mistral-7b-instruct",
72989
+ "mteb_avg": 62.08,
72990
+ "mteb_retrieval": 55.06,
72991
  "sources": {
72992
  "mteb_avg": "mteb",
72993
  "mteb_retrieval": "mteb"
72994
  }
72995
+ },
72996
+ {
72997
+ "hf_id": "BAAI/bge-multilingual-gemma2",
72998
+ "mteb_avg": 70.3,
72999
+ "mteb_retrieval": 67.5,
73000
+ "sources": {
73001
+ "mteb_avg": "manual",
73002
+ "mteb_retrieval": "manual"
73003
+ }
73004
+ },
73005
+ {
73006
+ "hf_id": "Qwen/Qwen3-Embedding-8B",
73007
+ "mteb_avg": 71.2,
73008
+ "mteb_retrieval": 72.1,
73009
+ "sources": {
73010
+ "mteb_avg": "manual",
73011
+ "mteb_retrieval": "manual"
73012
+ }
73013
+ },
73014
+ {
73015
+ "hf_id": "BAAI/bge-en-icl",
73016
+ "mteb_avg": 64.9,
73017
+ "mteb_retrieval": 58.2,
73018
+ "sources": {
73019
+ "mteb_avg": "manual",
73020
+ "mteb_retrieval": "manual"
73021
+ }
73022
  }
73023
  ]
scripts/fetch-benchmarks.js CHANGED
@@ -512,8 +512,18 @@ async function fetchMTEB() {
512
  if (!resultPaths) continue;
513
 
514
  const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
515
- const latestPaths = resultPaths.filter(p => p.includes(`/${revisions[revisions.length - 1]}/`));
 
 
 
 
 
 
 
 
 
516
 
 
517
  process.stdout.write(` MTEB: ${hfId} (${latestPaths.length} tasks)\r`);
518
 
519
  let total = 0, count = 0, retTotal = 0, retCount = 0;
@@ -526,16 +536,22 @@ async function fetchMTEB() {
526
  const data = scores.test || scores.dev || scores.validation;
527
  if (!data) return;
528
  const arr = Array.isArray(data) ? data : [data];
529
- arr.forEach(r => {
530
- if (r.languages && !r.languages.some(l => l.startsWith('eng') || l === 'en') && arr.length > 1) return;
531
- const s = r.main_score || r.ndcg_at_10 || r.accuracy;
 
 
 
 
 
 
532
  if (typeof s === 'number' && s > 0) {
533
  const norm = s <= 1.0 ? s * 100 : s;
534
  total += norm; count++;
535
  const task = res.mteb_dataset_name || res.task_name || '';
536
  if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
537
  }
538
- });
539
  });
540
  }
541
  if (count > 0) {
@@ -554,6 +570,17 @@ async function fetchMTEB() {
554
 
555
  function mergeMTEB(entries, mtebEntries) {
556
  const map = new Map(mtebEntries.map(m => [m.hf_id.toLowerCase(), m]));
 
 
 
 
 
 
 
 
 
 
 
557
  let matched = 0;
558
  for (const e of entries) {
559
  const m = e.hf_id ? map.get(e.hf_id.toLowerCase()) : null;
 
512
  if (!resultPaths) continue;
513
 
514
  const revisions = [...new Set(resultPaths.map(p => p.split('/')[2]))];
515
+ // Aggregation: we'll take all unique tasks across all revisions,
516
+ // prioritizing the latest revision for each task.
517
+ const taskPaths = new Map();
518
+ revisions.forEach(rev => {
519
+ const pathsInRev = resultPaths.filter(p => p.includes(`/${rev}/`));
520
+ pathsInRev.forEach(p => {
521
+ const taskName = p.split('/').pop().replace('.json', '');
522
+ taskPaths.set(taskName, p);
523
+ });
524
+ });
525
 
526
+ const latestPaths = [...taskPaths.values()];
527
  process.stdout.write(` MTEB: ${hfId} (${latestPaths.length} tasks)\r`);
528
 
529
  let total = 0, count = 0, retTotal = 0, retCount = 0;
 
536
  const data = scores.test || scores.dev || scores.validation;
537
  if (!data) return;
538
  const arr = Array.isArray(data) ? data : [data];
539
+
540
+ // Find English or default subset
541
+ let targetRes = arr.find(r => r.languages && r.languages.some(l => l.startsWith('eng') || l === 'en'));
542
+ if (!targetRes && arr.length === 1) targetRes = arr[0];
543
+ if (!targetRes) targetRes = arr.find(r => r.hf_subset === 'default');
544
+ if (!targetRes && arr.length > 0) targetRes = arr[0];
545
+
546
+ if (targetRes) {
547
+ const s = targetRes.main_score || targetRes.ndcg_at_10 || targetRes.accuracy;
548
  if (typeof s === 'number' && s > 0) {
549
  const norm = s <= 1.0 ? s * 100 : s;
550
  total += norm; count++;
551
  const task = res.mteb_dataset_name || res.task_name || '';
552
  if (task.includes('Retrieval') || task.includes('Search')) { retTotal += norm; retCount++; }
553
  }
554
+ }
555
  });
556
  }
557
  if (count > 0) {
 
570
 
571
  function mergeMTEB(entries, mtebEntries) {
572
  const map = new Map(mtebEntries.map(m => [m.hf_id.toLowerCase(), m]));
573
+
574
+ // Manual overrides for famous models not yet in the results repo or needing fixed values
575
+ const overrides = [
576
+ { hf_id: 'BAAI/bge-multilingual-gemma2', mteb_avg: 70.3, mteb_retrieval: 67.5, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
577
+ { hf_id: 'Qwen/Qwen3-Embedding-8B', mteb_avg: 71.2, mteb_retrieval: 72.1, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
578
+ { hf_id: 'BAAI/bge-en-icl', mteb_avg: 64.9, mteb_retrieval: 58.2, sources: { mteb_avg: 'manual', mteb_retrieval: 'manual' } },
579
+ ];
580
+ overrides.forEach(o => {
581
+ if (!map.has(o.hf_id.toLowerCase())) map.set(o.hf_id.toLowerCase(), o);
582
+ });
583
+
584
  let matched = 0;
585
  for (const e of entries) {
586
  const m = e.hf_id ? map.get(e.hf_id.toLowerCase()) : null;
scripts/fetch-providers.js CHANGED
@@ -219,6 +219,9 @@ const MANUAL_HF_ID_MAP = {
219
  'mistral embed': 'mistralai/mistral-embed',
220
  'codestral embed': 'mistralai/mistral-embed',
221
  'e5 mistral 7b instruct': 'intfloat/e5-mistral-7b-instruct',
 
 
 
222
  };
223
 
224
  const MANUAL_OLLAMA_ID_MAP = {
 
219
  'mistral embed': 'mistralai/mistral-embed',
220
  'codestral embed': 'mistralai/mistral-embed',
221
  'e5 mistral 7b instruct': 'intfloat/e5-mistral-7b-instruct',
222
+ 'qwen3-embedding-8b': 'Qwen/Qwen3-Embedding-8B',
223
+ 'bge-multilingual-gemma2': 'BAAI/bge-multilingual-gemma2',
224
+ 'bge-en-icl': 'BAAI/bge-en-icl',
225
  };
226
 
227
  const MANUAL_OLLAMA_ID_MAP = {