Muennighoff committed on
Commit
bdf66dc
1 Parent(s): d32c2e5

Update OpusparcusPC & LLM2Vec

Browse files
Files changed (2)
  1. EXTERNAL_MODEL_RESULTS.json +0 -0
  2. app.py +28 -27
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
app.py CHANGED
@@ -340,13 +340,13 @@ EXTERNAL_MODELS = [
340
  "Cohere-embed-multilingual-light-v3.0",
341
  "DanskBERT",
342
  "LASER2",
 
 
 
 
 
 
343
  "LaBSE",
344
- # "LLM2Vec-Llama-supervised",
345
- # "LLM2Vec-Llama-unsupervised",
346
- # "LLM2Vec-Mistral-supervised",
347
- # "LLM2Vec-Mistral-unsupervised",
348
- # "LLM2Vec-Sheared-Llama-supervised",
349
- # "LLM2Vec-Sheared-Llama-unsupervised",
350
  "OpenSearch-text-hybrid",
351
  "all-MiniLM-L12-v2",
352
  "all-MiniLM-L6-v2",
@@ -456,21 +456,25 @@ EXTERNAL_MODELS = [
456
  ]
457
 
458
  EXTERNAL_MODEL_TO_LINK = {
 
459
  "Cohere-embed-english-v3.0": "https://huggingface.co/Cohere/Cohere-embed-english-v3.0",
460
  "Cohere-embed-multilingual-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-v3.0",
461
  "Cohere-embed-multilingual-light-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-light-v3.0",
 
 
462
  "LLM2Vec-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised",
463
  "LLM2Vec-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp",
464
  "LLM2Vec-Mistral-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
465
  "LLM2Vec-Mistral-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp",
466
  "LLM2Vec-Sheared-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
467
  "LLM2Vec-Sheared-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
 
 
468
  "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
469
  "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
470
  "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
471
  "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
472
  "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
473
- "Baichuan-text-embedding": "https://platform.baichuan-ai.com/docs/text-Embedding",
474
  "bert-base-10lang-cased": "https://huggingface.co/Geotrend/bert-base-10lang-cased",
475
  "bert-base-15lang-cased": "https://huggingface.co/Geotrend/bert-base-15lang-cased",
476
  "bert-base-25lang-cased": "https://huggingface.co/Geotrend/bert-base-25lang-cased",
@@ -487,7 +491,6 @@ EXTERNAL_MODEL_TO_LINK = {
487
  "camembert-large": "https://huggingface.co/almanach/camembert-large",
488
  "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
489
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
490
- "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
491
  "distilbert-base-25lang-cased": "https://huggingface.co/Geotrend/distilbert-base-25lang-cased",
492
  "distilbert-base-en-fr-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-cased",
493
  "distilbert-base-en-fr-es-pt-it-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-es-pt-it-cased",
@@ -520,8 +523,6 @@ EXTERNAL_MODEL_TO_LINK = {
520
  "herbert-base-retrieval-v2": "https://huggingface.co/ipipan/herbert-base-retrieval-v2",
521
  "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
522
  "luotuo-bert-medium": "https://huggingface.co/silk-road/luotuo-bert-medium",
523
- "LASER2": "https://github.com/facebookresearch/LASER",
524
- "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
525
  "m3e-base": "https://huggingface.co/moka-ai/m3e-base",
526
  "m3e-large": "https://huggingface.co/moka-ai/m3e-large",
527
  "mistral-embed": "https://docs.mistral.ai/guides/embeddings",
@@ -538,7 +539,6 @@ EXTERNAL_MODEL_TO_LINK = {
538
  "nomic-embed-text-v1.5-512": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5",
539
  "norbert3-base": "https://huggingface.co/ltg/norbert3-base",
540
  "norbert3-large": "https://huggingface.co/ltg/norbert3-large",
541
- "OpenSearch-text-hybrid": "https://help.aliyun.com/zh/open-search/vector-search-edition/hybrid-retrieval",
542
  "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
543
  "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
544
  "sentence-camembert-base": "https://huggingface.co/dangvantuan/sentence-camembert-base",
@@ -586,20 +586,23 @@ EXTERNAL_MODEL_TO_LINK = {
586
  }
587
 
588
  EXTERNAL_MODEL_TO_DIM = {
 
589
  "Cohere-embed-english-v3.0": 1024,
590
  "Cohere-embed-multilingual-v3.0": 1024,
591
  "Cohere-embed-multilingual-light-v3.0": 384,
 
 
592
  "LLM2Vec-Llama-supervised": 4096,
593
  "LLM2Vec-Llama-unsupervised": 4096,
594
  "LLM2Vec-Mistral-supervised": 4096,
595
  "LLM2Vec-Mistral-unsupervised": 4096,
596
  "LLM2Vec-Sheared-Llama-supervised": 2048,
597
  "LLM2Vec-Sheared-Llama-unsupervised": 2048,
 
598
  "all-MiniLM-L12-v2": 384,
599
  "all-MiniLM-L6-v2": 384,
600
  "all-mpnet-base-v2": 768,
601
  "allenai-specter": 768,
602
- "Baichuan-text-embedding": 1024,
603
  "bert-base-10lang-cased": 768,
604
  "bert-base-15lang-cased": 768,
605
  "bert-base-25lang-cased": 768,
@@ -616,7 +619,6 @@ EXTERNAL_MODEL_TO_DIM = {
616
  "camembert-large": 768,
617
  "contriever-base-msmarco": 768,
618
  "cross-en-de-roberta-sentence-transformer": 768,
619
- "DanskBERT": 768,
620
  "distilbert-base-25lang-cased": 768,
621
  "distilbert-base-en-fr-cased": 768,
622
  "distilbert-base-en-fr-es-pt-it-cased": 768,
@@ -635,8 +637,6 @@ EXTERNAL_MODEL_TO_DIM = {
635
  "flaubert_base_uncased": 768,
636
  "flaubert_large_cased": 1024,
637
  "luotuo-bert-medium": 768,
638
- "LASER2": 1024,
639
- "LaBSE": 768,
640
  "gbert-base": 768,
641
  "gbert-large": 1024,
642
  "gelectra-base": 768,
@@ -715,20 +715,23 @@ EXTERNAL_MODEL_TO_DIM = {
715
  }
716
 
717
  EXTERNAL_MODEL_TO_SEQLEN = {
 
718
  "Cohere-embed-english-v3.0": 512,
719
  "Cohere-embed-multilingual-v3.0": 512,
720
- "Cohere-embed-multilingual-light-v3.0": 512,
 
 
721
  "LLM2Vec-Llama-supervised": 512,
722
  "LLM2Vec-Llama-unsupervised": 512,
723
  "LLM2Vec-Mistral-supervised": 512,
724
  "LLM2Vec-Mistral-unsupervised": 512,
725
  "LLM2Vec-Sheared-Llama-supervised": 512,
726
  "LLM2Vec-Sheared-Llama-unsupervised": 512,
 
727
  "all-MiniLM-L12-v2": 512,
728
  "all-MiniLM-L6-v2": 512,
729
  "all-mpnet-base-v2": 514,
730
  "allenai-specter": 512,
731
- "Baichuan-text-embedding": 512,
732
  "bert-base-10lang-cased": 512,
733
  "bert-base-15lang-cased": 512,
734
  "bert-base-25lang-cased": 512,
@@ -749,8 +752,7 @@ EXTERNAL_MODEL_TO_SEQLEN = {
749
  "distilbert-base-en-fr-cased": 512,
750
  "distilbert-base-en-fr-es-pt-it-cased": 512,
751
  "distilbert-base-fr-cased": 512,
752
- "distilbert-base-uncased": 512,
753
- "DanskBERT": 514,
754
  "dfm-encoder-large-v1": 512,
755
  "dfm-sentence-encoder-large-1": 512,
756
  "distiluse-base-multilingual-cased-v2": 512,
@@ -778,8 +780,6 @@ EXTERNAL_MODEL_TO_SEQLEN = {
778
  "herbert-base-retrieval-v2": 514,
779
  "komninos": "N/A",
780
  "luotuo-bert-medium": 512,
781
- "LASER2": "N/A",
782
- "LaBSE": 512,
783
  "m3e-base": 512,
784
  "m3e-large": 512,
785
  # "mistral-embed": "?",
@@ -844,12 +844,15 @@ EXTERNAL_MODEL_TO_SEQLEN = {
844
  }
845
 
846
  EXTERNAL_MODEL_TO_SIZE = {
 
 
847
  "LLM2Vec-Llama-supervised": 6607,
848
  "LLM2Vec-Llama-unsupervised": 6607,
849
  "LLM2Vec-Mistral-supervised": 7111,
850
  "LLM2Vec-Mistral-unsupervised": 7111,
851
  "LLM2Vec-Sheared-Llama-supervised": 1280,
852
  "LLM2Vec-Sheared-Llama-unsupervised": 1280,
 
853
  "allenai-specter": 110,
854
  "all-MiniLM-L12-v2": 33,
855
  "all-MiniLM-L6-v2": 23,
@@ -874,7 +877,6 @@ EXTERNAL_MODEL_TO_SIZE = {
874
  "distilbert-base-en-fr-es-pt-it-cased": 110,
875
  "distilbert-base-fr-cased": 110,
876
  "distilbert-base-uncased": 110,
877
- "DanskBERT": 125,
878
  "distiluse-base-multilingual-cased-v2": 135,
879
  "dfm-encoder-large-v1": 355,
880
  "dfm-sentence-encoder-large-1": 355,
@@ -901,9 +903,7 @@ EXTERNAL_MODEL_TO_SIZE = {
901
  "gtr-t5-xxl": 4865,
902
  "herbert-base-retrieval-v2": 125,
903
  "komninos": 134,
904
- "luotuo-bert-medium": 328,
905
- "LASER2": 43,
906
- "LaBSE": 471,
907
  "m3e-base": 102,
908
  "m3e-large": 102,
909
  "msmarco-bert-co-condensor": 110,
@@ -944,12 +944,12 @@ EXTERNAL_MODEL_TO_SIZE = {
944
  }
945
 
946
  PROPRIETARY_MODELS = {
 
947
  "Cohere-embed-english-v3.0",
948
  "Cohere-embed-multilingual-v3.0",
949
  "Cohere-embed-multilingual-light-v3.0",
950
- "Baichuan-text-embedding",
951
- "mistral-embed",
952
  "OpenSearch-text-hybrid",
 
953
  "text-embedding-3-small",
954
  "text-embedding-3-large",
955
  "text-embedding-3-large-256",
@@ -973,6 +973,7 @@ PROPRIETARY_MODELS = {
973
  "google-gecko.text-embedding-preview-0409",
974
  "google-gecko-256.text-embedding-preview-0409",
975
  }
 
976
  PROPRIETARY_MODELS = {
977
  make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))
978
  for model in PROPRIETARY_MODELS
340
  "Cohere-embed-multilingual-light-v3.0",
341
  "DanskBERT",
342
  "LASER2",
343
+ "LLM2Vec-Llama-supervised",
344
+ "LLM2Vec-Llama-unsupervised",
345
+ "LLM2Vec-Mistral-supervised",
346
+ "LLM2Vec-Mistral-unsupervised",
347
+ "LLM2Vec-Sheared-Llama-supervised",
348
+ "LLM2Vec-Sheared-Llama-unsupervised",
349
  "LaBSE",
 
 
 
 
 
 
350
  "OpenSearch-text-hybrid",
351
  "all-MiniLM-L12-v2",
352
  "all-MiniLM-L6-v2",
456
  ]
457
 
458
  EXTERNAL_MODEL_TO_LINK = {
459
+ "Baichuan-text-embedding": "https://platform.baichuan-ai.com/docs/text-Embedding",
460
  "Cohere-embed-english-v3.0": "https://huggingface.co/Cohere/Cohere-embed-english-v3.0",
461
  "Cohere-embed-multilingual-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-v3.0",
462
  "Cohere-embed-multilingual-light-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-light-v3.0",
463
+ "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
464
+ "LASER2": "https://github.com/facebookresearch/LASER",
465
  "LLM2Vec-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised",
466
  "LLM2Vec-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp",
467
  "LLM2Vec-Mistral-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised",
468
  "LLM2Vec-Mistral-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp",
469
  "LLM2Vec-Sheared-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised",
470
  "LLM2Vec-Sheared-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
471
+ "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
472
+ "OpenSearch-text-hybrid": "https://help.aliyun.com/zh/open-search/vector-search-edition/hybrid-retrieval",
473
  "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
474
  "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
475
  "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
476
  "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
477
  "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
 
478
  "bert-base-10lang-cased": "https://huggingface.co/Geotrend/bert-base-10lang-cased",
479
  "bert-base-15lang-cased": "https://huggingface.co/Geotrend/bert-base-15lang-cased",
480
  "bert-base-25lang-cased": "https://huggingface.co/Geotrend/bert-base-25lang-cased",
491
  "camembert-large": "https://huggingface.co/almanach/camembert-large",
492
  "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
493
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
 
494
  "distilbert-base-25lang-cased": "https://huggingface.co/Geotrend/distilbert-base-25lang-cased",
495
  "distilbert-base-en-fr-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-cased",
496
  "distilbert-base-en-fr-es-pt-it-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-es-pt-it-cased",
523
  "herbert-base-retrieval-v2": "https://huggingface.co/ipipan/herbert-base-retrieval-v2",
524
  "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
525
  "luotuo-bert-medium": "https://huggingface.co/silk-road/luotuo-bert-medium",
 
 
526
  "m3e-base": "https://huggingface.co/moka-ai/m3e-base",
527
  "m3e-large": "https://huggingface.co/moka-ai/m3e-large",
528
  "mistral-embed": "https://docs.mistral.ai/guides/embeddings",
539
  "nomic-embed-text-v1.5-512": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5",
540
  "norbert3-base": "https://huggingface.co/ltg/norbert3-base",
541
  "norbert3-large": "https://huggingface.co/ltg/norbert3-large",
 
542
  "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
543
  "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
544
  "sentence-camembert-base": "https://huggingface.co/dangvantuan/sentence-camembert-base",
586
  }
587
 
588
  EXTERNAL_MODEL_TO_DIM = {
589
+ "Baichuan-text-embedding": 1024,
590
  "Cohere-embed-english-v3.0": 1024,
591
  "Cohere-embed-multilingual-v3.0": 1024,
592
  "Cohere-embed-multilingual-light-v3.0": 384,
593
+ "DanskBERT": 768,
594
+ "LASER2": 1024,
595
  "LLM2Vec-Llama-supervised": 4096,
596
  "LLM2Vec-Llama-unsupervised": 4096,
597
  "LLM2Vec-Mistral-supervised": 4096,
598
  "LLM2Vec-Mistral-unsupervised": 4096,
599
  "LLM2Vec-Sheared-Llama-supervised": 2048,
600
  "LLM2Vec-Sheared-Llama-unsupervised": 2048,
601
+ "LaBSE": 768,
602
  "all-MiniLM-L12-v2": 384,
603
  "all-MiniLM-L6-v2": 384,
604
  "all-mpnet-base-v2": 768,
605
  "allenai-specter": 768,
 
606
  "bert-base-10lang-cased": 768,
607
  "bert-base-15lang-cased": 768,
608
  "bert-base-25lang-cased": 768,
619
  "camembert-large": 768,
620
  "contriever-base-msmarco": 768,
621
  "cross-en-de-roberta-sentence-transformer": 768,
 
622
  "distilbert-base-25lang-cased": 768,
623
  "distilbert-base-en-fr-cased": 768,
624
  "distilbert-base-en-fr-es-pt-it-cased": 768,
637
  "flaubert_base_uncased": 768,
638
  "flaubert_large_cased": 1024,
639
  "luotuo-bert-medium": 768,
 
 
640
  "gbert-base": 768,
641
  "gbert-large": 1024,
642
  "gelectra-base": 768,
715
  }
716
 
717
  EXTERNAL_MODEL_TO_SEQLEN = {
718
+ "Baichuan-text-embedding": 512,
719
  "Cohere-embed-english-v3.0": 512,
720
  "Cohere-embed-multilingual-v3.0": 512,
721
+ "Cohere-embed-multilingual-light-v3.0": 512,
722
+ "DanskBERT": 514,
723
+ "LASER2": "N/A",
724
  "LLM2Vec-Llama-supervised": 512,
725
  "LLM2Vec-Llama-unsupervised": 512,
726
  "LLM2Vec-Mistral-supervised": 512,
727
  "LLM2Vec-Mistral-unsupervised": 512,
728
  "LLM2Vec-Sheared-Llama-supervised": 512,
729
  "LLM2Vec-Sheared-Llama-unsupervised": 512,
730
+ "LaBSE": 512,
731
  "all-MiniLM-L12-v2": 512,
732
  "all-MiniLM-L6-v2": 512,
733
  "all-mpnet-base-v2": 514,
734
  "allenai-specter": 512,
 
735
  "bert-base-10lang-cased": 512,
736
  "bert-base-15lang-cased": 512,
737
  "bert-base-25lang-cased": 512,
752
  "distilbert-base-en-fr-cased": 512,
753
  "distilbert-base-en-fr-es-pt-it-cased": 512,
754
  "distilbert-base-fr-cased": 512,
755
+ "distilbert-base-uncased": 512,
 
756
  "dfm-encoder-large-v1": 512,
757
  "dfm-sentence-encoder-large-1": 512,
758
  "distiluse-base-multilingual-cased-v2": 512,
780
  "herbert-base-retrieval-v2": 514,
781
  "komninos": "N/A",
782
  "luotuo-bert-medium": 512,
 
 
783
  "m3e-base": 512,
784
  "m3e-large": 512,
785
  # "mistral-embed": "?",
844
  }
845
 
846
  EXTERNAL_MODEL_TO_SIZE = {
847
+ "DanskBERT": 125,
848
+ "LASER2": 43,
849
  "LLM2Vec-Llama-supervised": 6607,
850
  "LLM2Vec-Llama-unsupervised": 6607,
851
  "LLM2Vec-Mistral-supervised": 7111,
852
  "LLM2Vec-Mistral-unsupervised": 7111,
853
  "LLM2Vec-Sheared-Llama-supervised": 1280,
854
  "LLM2Vec-Sheared-Llama-unsupervised": 1280,
855
+ "LaBSE": 471,
856
  "allenai-specter": 110,
857
  "all-MiniLM-L12-v2": 33,
858
  "all-MiniLM-L6-v2": 23,
877
  "distilbert-base-en-fr-es-pt-it-cased": 110,
878
  "distilbert-base-fr-cased": 110,
879
  "distilbert-base-uncased": 110,
 
880
  "distiluse-base-multilingual-cased-v2": 135,
881
  "dfm-encoder-large-v1": 355,
882
  "dfm-sentence-encoder-large-1": 355,
903
  "gtr-t5-xxl": 4865,
904
  "herbert-base-retrieval-v2": 125,
905
  "komninos": 134,
906
+ "luotuo-bert-medium": 328,
 
 
907
  "m3e-base": 102,
908
  "m3e-large": 102,
909
  "msmarco-bert-co-condensor": 110,
944
  }
945
 
946
  PROPRIETARY_MODELS = {
947
+ "Baichuan-text-embedding",
948
  "Cohere-embed-english-v3.0",
949
  "Cohere-embed-multilingual-v3.0",
950
  "Cohere-embed-multilingual-light-v3.0",
 
 
951
  "OpenSearch-text-hybrid",
952
+ "mistral-embed",
953
  "text-embedding-3-small",
954
  "text-embedding-3-large",
955
  "text-embedding-3-large-256",
973
  "google-gecko.text-embedding-preview-0409",
974
  "google-gecko-256.text-embedding-preview-0409",
975
  }
976
+
977
  PROPRIETARY_MODELS = {
978
  make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))
979
  for model in PROPRIETARY_MODELS