Muennighoff committed on
Commit
bb5f655
·
1 Parent(s): e8ba190

Add BRIGHT

Browse files
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
 
all_data_tasks/0/default.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,18 +1,12 @@
1
  from functools import reduce
2
- import json
3
- import pickle
4
- import os
5
  import re
6
 
7
  import gradio as gr
8
  import pandas as pd
9
- from tqdm.autonotebook import tqdm
10
 
11
- from utils.model_size import get_model_parameters_memory
12
- from refresh import TASK_TO_METRIC, TASKS, PRETTY_NAMES, TASKS_CONFIG, BOARDS_CONFIG, load_results
13
  from envs import REPO_ID
14
- from refresh import PROPRIETARY_MODELS, SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS, CROSS_ENCODERS, BI_ENCODERS, TASK_DESCRIPTIONS, EXTERNAL_MODEL_TO_LINK, make_clickable_model
15
-
16
 
17
 
18
  PROPRIETARY_MODELS = {
@@ -33,7 +27,6 @@ BI_ENCODERS = {
33
  }
34
 
35
 
36
-
37
  def make_datasets_clickable(df):
38
  """Does not work"""
39
  if "BornholmBitextMining" in df.columns:
@@ -43,7 +36,6 @@ def make_datasets_clickable(df):
43
  return df
44
 
45
 
46
-
47
  # 1. Force headers to wrap
48
  # 2. Force model column (maximum) width
49
  # 3. Prevent model column from overflowing, scroll instead
 
1
  from functools import reduce
 
 
 
2
  import re
3
 
4
  import gradio as gr
5
  import pandas as pd
 
6
 
 
 
7
  from envs import REPO_ID
8
+ from refresh import BOARDS_CONFIG, TASKS, TASKS_CONFIG, TASK_DESCRIPTIONS, PRETTY_NAMES, load_results, make_clickable_model
9
+ from refresh import PROPRIETARY_MODELS, SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS, CROSS_ENCODERS, BI_ENCODERS, EXTERNAL_MODEL_TO_LINK
10
 
11
 
12
  PROPRIETARY_MODELS = {
 
27
  }
28
 
29
 
 
30
  def make_datasets_clickable(df):
31
  """Does not work"""
32
  if "BornholmBitextMining" in df.columns:
 
36
  return df
37
 
38
 
 
39
  # 1. Force headers to wrap
40
  # 2. Force model column (maximum) width
41
  # 3. Prevent model column from overflowing, scroll instead
boards_data/bright/data_overall/default.txt ADDED
File without changes
boards_data/bright/data_tasks/Retrieval/default.jsonl ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"index":4,"Rank":1,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/Alibaba-NLP\/gte-Qwen2-7B-instruct\">gte-Qwen2-7B-instruct<\/a>","Model Size (Million Parameters)":7613,"Memory Usage (GB, fp32)":"28.36","Average":22.38,"BrightRetrieval (aops)":15.1,"BrightRetrieval (biology)":32.09,"BrightRetrieval (earth_science)":40.66,"BrightRetrieval (economics)":16.18,"BrightRetrieval (leetcode)":31.07,"BrightRetrieval (pony)":1.25,"BrightRetrieval (psychology)":26.58,"BrightRetrieval (robotics)":12.82,"BrightRetrieval (stackoverflow)":13.95,"BrightRetrieval (sustainable_living)":20.82,"BrightRetrieval (theoremqa_questions)":29.9,"BrightRetrieval (theoremqa_theorems)":28.15}
2
+ {"index":3,"Rank":2,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/Alibaba-NLP\/gte-Qwen1.5-7B-instruct\">gte-Qwen1.5-7B-instruct<\/a>","Model Size (Million Parameters)":7099,"Memory Usage (GB, fp32)":"26.45","Average":21.75,"BrightRetrieval (aops)":14.36,"BrightRetrieval (biology)":30.92,"BrightRetrieval (earth_science)":36.22,"BrightRetrieval (economics)":17.72,"BrightRetrieval (leetcode)":25.46,"BrightRetrieval (pony)":9.79,"BrightRetrieval (psychology)":24.61,"BrightRetrieval (robotics)":13.47,"BrightRetrieval (stackoverflow)":19.85,"BrightRetrieval (sustainable_living)":14.93,"BrightRetrieval (theoremqa_questions)":26.97,"BrightRetrieval (theoremqa_theorems)":26.66}
3
+ {"index":7,"Rank":3,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/GritLM\/GritLM-7B\">GritLM-7B<\/a>","Model Size (Million Parameters)":7240,"Memory Usage (GB, fp32)":"26.97","Average":20.43,"BrightRetrieval (aops)":8.91,"BrightRetrieval (biology)":25.04,"BrightRetrieval (earth_science)":32.77,"BrightRetrieval (economics)":19.0,"BrightRetrieval (leetcode)":29.85,"BrightRetrieval (pony)":21.98,"BrightRetrieval (psychology)":19.92,"BrightRetrieval (robotics)":17.31,"BrightRetrieval (stackoverflow)":11.62,"BrightRetrieval (sustainable_living)":18.04,"BrightRetrieval (theoremqa_questions)":23.34,"BrightRetrieval (theoremqa_theorems)":17.41}
4
+ {"index":0,"Rank":4,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/cloud.google.com\/vertex-ai\/generative-ai\/docs\/embeddings\/get-text-embeddings#latest_models\">google-gecko.text-embedding-preview-0409<\/a>","Model Size (Million Parameters)":1200,"Memory Usage (GB, fp32)":"4.47","Average":19.73,"BrightRetrieval (aops)":9.33,"BrightRetrieval (biology)":22.98,"BrightRetrieval (earth_science)":34.38,"BrightRetrieval (economics)":19.5,"BrightRetrieval (leetcode)":29.64,"BrightRetrieval (pony)":3.59,"BrightRetrieval (psychology)":27.86,"BrightRetrieval (robotics)":15.98,"BrightRetrieval (stackoverflow)":17.93,"BrightRetrieval (sustainable_living)":17.25,"BrightRetrieval (theoremqa_questions)":21.51,"BrightRetrieval (theoremqa_theorems)":16.77}
5
+ {"index":10,"Rank":5,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/hkunlp\/instructor-xl\">instructor-xl<\/a>","Model Size (Million Parameters)":1241,"Memory Usage (GB, fp32)":"4.62","Average":18.64,"BrightRetrieval (aops)":8.26,"BrightRetrieval (biology)":21.91,"BrightRetrieval (earth_science)":34.35,"BrightRetrieval (economics)":22.81,"BrightRetrieval (leetcode)":27.5,"BrightRetrieval (pony)":5.02,"BrightRetrieval (psychology)":27.43,"BrightRetrieval (robotics)":17.39,"BrightRetrieval (stackoverflow)":19.06,"BrightRetrieval (sustainable_living)":18.82,"BrightRetrieval (theoremqa_questions)":14.59,"BrightRetrieval (theoremqa_theorems)":6.5}
6
+ {"index":8,"Rank":6,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/Salesforce\/SFR-Embedding-Mistral\">SFR-Embedding-Mistral<\/a>","Model Size (Million Parameters)":7111,"Memory Usage (GB, fp32)":"26.49","Average":18.0,"BrightRetrieval (aops)":7.43,"BrightRetrieval (biology)":19.49,"BrightRetrieval (earth_science)":26.63,"BrightRetrieval (economics)":17.84,"BrightRetrieval (leetcode)":27.35,"BrightRetrieval (pony)":1.97,"BrightRetrieval (psychology)":18.97,"BrightRetrieval (robotics)":16.7,"BrightRetrieval (stackoverflow)":12.72,"BrightRetrieval (sustainable_living)":19.79,"BrightRetrieval (theoremqa_questions)":23.05,"BrightRetrieval (theoremqa_theorems)":24.05}
7
+ {"index":1,"Rank":7,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/docs.voyageai.com\/embeddings\/\">voyage-large-2-instruct<\/a>","Model Size (Million Parameters)":"","Memory Usage (GB, fp32)":"","Average":17.57,"BrightRetrieval (aops)":7.45,"BrightRetrieval (biology)":23.55,"BrightRetrieval (earth_science)":25.09,"BrightRetrieval (economics)":19.85,"BrightRetrieval (leetcode)":30.6,"BrightRetrieval (pony)":1.48,"BrightRetrieval (psychology)":24.79,"BrightRetrieval (robotics)":11.21,"BrightRetrieval (stackoverflow)":15.03,"BrightRetrieval (sustainable_living)":15.58,"BrightRetrieval (theoremqa_questions)":26.06,"BrightRetrieval (theoremqa_theorems)":10.13}
8
+ {"index":13,"Rank":8,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/openai.com\/blog\/new-embedding-models-and-api-updates\">text-embedding-3-large<\/a>","Model Size (Million Parameters)":"","Memory Usage (GB, fp32)":"","Average":17.43,"BrightRetrieval (aops)":8.45,"BrightRetrieval (biology)":23.67,"BrightRetrieval (earth_science)":26.27,"BrightRetrieval (economics)":19.98,"BrightRetrieval (leetcode)":23.65,"BrightRetrieval (pony)":2.45,"BrightRetrieval (psychology)":27.52,"BrightRetrieval (robotics)":12.93,"BrightRetrieval (stackoverflow)":12.49,"BrightRetrieval (sustainable_living)":20.32,"BrightRetrieval (theoremqa_questions)":22.22,"BrightRetrieval (theoremqa_theorems)":9.25}
9
+ {"index":11,"Rank":9,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/intfloat\/e5-mistral-7b-instruct\">e5-mistral-7b-instruct<\/a>","Model Size (Million Parameters)":7111,"Memory Usage (GB, fp32)":"26.49","Average":17.43,"BrightRetrieval (aops)":7.1,"BrightRetrieval (biology)":18.84,"BrightRetrieval (earth_science)":25.96,"BrightRetrieval (economics)":15.49,"BrightRetrieval (leetcode)":28.72,"BrightRetrieval (pony)":4.81,"BrightRetrieval (psychology)":15.79,"BrightRetrieval (robotics)":16.37,"BrightRetrieval (stackoverflow)":9.83,"BrightRetrieval (sustainable_living)":18.51,"BrightRetrieval (theoremqa_questions)":23.94,"BrightRetrieval (theoremqa_theorems)":23.78}
10
+ {"index":6,"Rank":10,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/Cohere\/Cohere-embed-english-v3.0\">Cohere-embed-english-v3.0<\/a>","Model Size (Million Parameters)":"","Memory Usage (GB, fp32)":"","Average":16.24,"BrightRetrieval (aops)":6.46,"BrightRetrieval (biology)":18.98,"BrightRetrieval (earth_science)":27.45,"BrightRetrieval (economics)":20.18,"BrightRetrieval (leetcode)":26.78,"BrightRetrieval (pony)":1.77,"BrightRetrieval (psychology)":21.82,"BrightRetrieval (robotics)":16.21,"BrightRetrieval (stackoverflow)":16.47,"BrightRetrieval (sustainable_living)":17.69,"BrightRetrieval (theoremqa_questions)":15.07,"BrightRetrieval (theoremqa_theorems)":6.04}
11
+ {"index":12,"Rank":11,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/sentence-transformers\/all-mpnet-base-v2\">all-mpnet-base-v2<\/a>","Model Size (Million Parameters)":110,"Memory Usage (GB, fp32)":"0.41","Average":14.8,"BrightRetrieval (aops)":5.32,"BrightRetrieval (biology)":15.52,"BrightRetrieval (earth_science)":20.11,"BrightRetrieval (economics)":16.64,"BrightRetrieval (leetcode)":26.4,"BrightRetrieval (pony)":6.95,"BrightRetrieval (psychology)":22.63,"BrightRetrieval (robotics)":8.36,"BrightRetrieval (stackoverflow)":9.48,"BrightRetrieval (sustainable_living)":15.34,"BrightRetrieval (theoremqa_questions)":18.49,"BrightRetrieval (theoremqa_theorems)":12.38}
12
+ {"index":2,"Rank":12,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/en.wikipedia.org\/wiki\/Okapi_BM25\">bm25<\/a>","Model Size (Million Parameters)":"","Memory Usage (GB, fp32)":"","Average":14.29,"BrightRetrieval (aops)":6.2,"BrightRetrieval (biology)":19.19,"BrightRetrieval (earth_science)":27.06,"BrightRetrieval (economics)":14.87,"BrightRetrieval (leetcode)":24.37,"BrightRetrieval (pony)":7.93,"BrightRetrieval (psychology)":12.51,"BrightRetrieval (robotics)":13.53,"BrightRetrieval (stackoverflow)":16.55,"BrightRetrieval (sustainable_living)":15.22,"BrightRetrieval (theoremqa_questions)":9.78,"BrightRetrieval (theoremqa_theorems)":4.25}
13
+ {"index":9,"Rank":13,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/hkunlp\/instructor-large\">instructor-large<\/a>","Model Size (Million Parameters)":335,"Memory Usage (GB, fp32)":"1.25","Average":14.12,"BrightRetrieval (aops)":7.94,"BrightRetrieval (biology)":15.61,"BrightRetrieval (earth_science)":21.52,"BrightRetrieval (economics)":15.99,"BrightRetrieval (leetcode)":20.0,"BrightRetrieval (pony)":1.32,"BrightRetrieval (psychology)":21.94,"BrightRetrieval (robotics)":11.45,"BrightRetrieval (stackoverflow)":11.21,"BrightRetrieval (sustainable_living)":13.16,"BrightRetrieval (theoremqa_questions)":20.07,"BrightRetrieval (theoremqa_theorems)":9.29}
14
+ {"index":5,"Rank":14,"Model":"<a target=\"_blank\" style=\"text-decoration: underline\" href=\"https:\/\/huggingface.co\/BAAI\/bge-large-en-v1.5\">bge-large-en-v1.5<\/a>","Model Size (Million Parameters)":"","Memory Usage (GB, fp32)":"","Average":13.47,"BrightRetrieval (aops)":6.08,"BrightRetrieval (biology)":11.96,"BrightRetrieval (earth_science)":24.15,"BrightRetrieval (economics)":16.59,"BrightRetrieval (leetcode)":26.68,"BrightRetrieval (pony)":5.64,"BrightRetrieval (psychology)":17.44,"BrightRetrieval (robotics)":12.21,"BrightRetrieval (stackoverflow)":9.51,"BrightRetrieval (sustainable_living)":13.27,"BrightRetrieval (theoremqa_questions)":12.56,"BrightRetrieval (theoremqa_theorems)":5.51}
config.yaml CHANGED
@@ -402,4 +402,27 @@ boards:
402
  special_icons: null
403
  credits: null
404
  tasks:
405
- STS: ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  special_icons: null
403
  credits: null
404
  tasks:
405
+ STS: ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark"]
406
+ bright:
407
+ title: BRIGHT
408
+ language_long: "English"
409
+ has_overall: false
410
+ acronym: null
411
+ icon: "🌟"
412
+ special_icons: null
413
+ credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)"
414
+ metric: nDCG@10
415
+ tasks:
416
+ Retrieval:
417
+ - BrightRetrieval (biology)
418
+ - BrightRetrieval (earth_science)
419
+ - BrightRetrieval (economics)
420
+ - BrightRetrieval (psychology)
421
+ - BrightRetrieval (robotics)
422
+ - BrightRetrieval (stackoverflow)
423
+ - BrightRetrieval (sustainable_living)
424
+ - BrightRetrieval (pony)
425
+ - BrightRetrieval (leetcode)
426
+ - BrightRetrieval (aops)
427
+ - BrightRetrieval (theoremqa_theorems)
428
+ - BrightRetrieval (theoremqa_questions)
model_meta.yaml CHANGED
@@ -1,12 +1,4 @@
1
  model_meta:
2
- gte-Qwen1.5-7B-instruct:
3
- link: https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct
4
- seq_len: 32768
5
- size: 7099
6
- dim: 4096
7
- is_external: true
8
- is_proprietary: false
9
- is_sentence_transformers_compatible: true
10
  Baichuan-text-embedding:
11
  link: https://platform.baichuan-ai.com/docs/text-Embedding
12
  seq_len: 512
@@ -149,6 +141,14 @@ model_meta:
149
  is_external: true
150
  is_proprietary: true
151
  is_sentence_transformers_compatible: false
 
 
 
 
 
 
 
 
152
  all-MiniLM-L12-v2:
153
  link: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
154
  seq_len: 512
@@ -571,6 +571,22 @@ model_meta:
571
  is_external: true
572
  is_proprietary: false
573
  is_sentence_transformers_compatible: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
  gtr-t5-base:
575
  link: https://huggingface.co/sentence-transformers/gtr-t5-base
576
  seq_len: 512
@@ -619,6 +635,14 @@ model_meta:
619
  is_external: true
620
  is_proprietary: false
621
  is_sentence_transformers_compatible: true
 
 
 
 
 
 
 
 
622
  instructor-xl:
623
  link: https://huggingface.co/hkunlp/instructor-xl
624
  seq_len: 512
 
1
  model_meta:
 
 
 
 
 
 
 
 
2
  Baichuan-text-embedding:
3
  link: https://platform.baichuan-ai.com/docs/text-Embedding
4
  seq_len: 512
 
141
  is_external: true
142
  is_proprietary: true
143
  is_sentence_transformers_compatible: false
144
+ SFR-Embedding-Mistral:
145
+ link: https://huggingface.co/Salesforce/SFR-Embedding-Mistral
146
+ seq_len: 32768
147
+ size: 7111
148
+ dim: 4096
149
+ is_external: true
150
+ is_proprietary: false
151
+ is_sentence_transformers_compatible: true
152
  all-MiniLM-L12-v2:
153
  link: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
154
  seq_len: 512
 
571
  is_external: true
572
  is_proprietary: false
573
  is_sentence_transformers_compatible: true
574
+ gte-Qwen1.5-7B-instruct:
575
+ link: https://huggingface.co/Alibaba-NLP/gte-Qwen1.5-7B-instruct
576
+ seq_len: 32768
577
+ size: 7099
578
+ dim: 4096
579
+ is_external: true
580
+ is_proprietary: false
581
+ is_sentence_transformers_compatible: true
582
+ gte-Qwen2-7B-instruct:
583
+ link: https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct
584
+ seq_len: 32768
585
+ size: 7613
586
+ dim: 3584
587
+ is_external: true
588
+ is_proprietary: false
589
+ is_sentence_transformers_compatible: true
590
  gtr-t5-base:
591
  link: https://huggingface.co/sentence-transformers/gtr-t5-base
592
  seq_len: 512
 
635
  is_external: true
636
  is_proprietary: false
637
  is_sentence_transformers_compatible: true
638
+ instructor-large:
639
+ link: https://huggingface.co/hkunlp/instructor-large
640
+ seq_len: 512
641
+ size: 335
642
+ dim: 768
643
+ is_external: true
644
+ is_proprietary: false
645
+ is_sentence_transformers_compatible: true
646
  instructor-xl:
647
  link: https://huggingface.co/hkunlp/instructor-xl
648
  seq_len: 512
refresh.py CHANGED
@@ -406,6 +406,8 @@ def refresh_leaderboard():
406
  all_data_tasks = []
407
  pbar_tasks = tqdm(BOARDS_CONFIG.items(), desc="Fetching leaderboard results for ???", total=len(BOARDS_CONFIG), leave=True)
408
  for board, board_config in pbar_tasks:
 
 
409
  boards_data[board] = {
410
  "data_overall": None,
411
  "data_tasks": {}
 
406
  all_data_tasks = []
407
  pbar_tasks = tqdm(BOARDS_CONFIG.items(), desc="Fetching leaderboard results for ???", total=len(BOARDS_CONFIG), leave=True)
408
  for board, board_config in pbar_tasks:
409
+ # When adding a single new board, uncomment the line below to skip all other boards and speed up the refresh
410
+ # if board != "new_board_name": continue
411
  boards_data[board] = {
412
  "data_overall": None,
413
  "data_tasks": {}