Muennighoff committed on
Commit
909b95d
1 Parent(s): 92494a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -12
app.py CHANGED
@@ -334,6 +334,8 @@ EXTERNAL_MODELS = [
334
  "st-polish-paraphrase-from-mpnet",
335
  "text2vec-base-chinese",
336
  "text2vec-large-chinese",
 
 
337
  "text-embedding-ada-002",
338
  "text-similarity-ada-001",
339
  "text-similarity-babbage-001",
@@ -414,17 +416,19 @@ EXTERNAL_MODEL_TO_LINK = {
414
  "st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
415
  "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
416
  "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
417
- "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
418
- "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
419
- "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
420
- "text-similarity-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
421
- "text-similarity-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
422
- "text-search-ada-doc-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
423
- "text-search-ada-query-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
424
- "text-search-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
425
- "text-search-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
426
- "text-search-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
427
- "text-search-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
 
 
428
  "titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html",
429
  "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
430
  "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
@@ -494,6 +498,8 @@ EXTERNAL_MODEL_TO_DIM = {
494
  "st-polish-paraphrase-from-mpnet": 768,
495
  "text2vec-base-chinese": 768,
496
  "text2vec-large-chinese": 1024,
 
 
497
  "text-embedding-ada-002": 1536,
498
  "text-similarity-ada-001": 1024,
499
  "text-similarity-babbage-001": 2048,
@@ -574,6 +580,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
574
  "st-polish-paraphrase-from-mpnet": 514,
575
  "text2vec-base-chinese": 512,
576
  "text2vec-large-chinese": 512,
 
 
577
  "text-embedding-ada-002": 8191,
578
  "text-similarity-ada-001": 2046,
579
  "text-similarity-babbage-001": 2046,
@@ -943,7 +951,11 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
943
  # Model & at least one result
944
  if len(out) > 1:
945
  if add_emb_dim:
946
- out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
 
 
 
 
947
  df_list.append(out)
948
  df = pd.DataFrame(df_list)
949
  # If there are any models that are the same, merge them
 
334
  "st-polish-paraphrase-from-mpnet",
335
  "text2vec-base-chinese",
336
  "text2vec-large-chinese",
337
+ "text-embedding-3-small",
338
+ "text-embedding-3-large",
339
  "text-embedding-ada-002",
340
  "text-similarity-ada-001",
341
  "text-similarity-babbage-001",
 
416
  "st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
417
  "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
418
  "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
419
+ "text-embedding-3-small": "https://openai.com/blog/new-embedding-models-and-api-updates",
420
+ "text-embedding-3-large": "https://openai.com/blog/new-embedding-models-and-api-updates",
421
+ "text-embedding-ada-002": "https://openai.com/blog/new-and-improved-embedding-model",
422
+ "text-similarity-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
423
+ "text-similarity-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
424
+ "text-similarity-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
425
+ "text-similarity-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
426
+ "text-search-ada-doc-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
427
+ "text-search-ada-query-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
428
+ "text-search-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
429
+ "text-search-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
430
+ "text-search-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
431
+ "text-search-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
432
  "titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html",
433
  "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
434
  "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
 
498
  "st-polish-paraphrase-from-mpnet": 768,
499
  "text2vec-base-chinese": 768,
500
  "text2vec-large-chinese": 1024,
501
+ "text-embedding-3-large": 3072,
502
+ "text-embedding-3-small": 1536,
503
  "text-embedding-ada-002": 1536,
504
  "text-similarity-ada-001": 1024,
505
  "text-similarity-babbage-001": 2048,
 
580
  "st-polish-paraphrase-from-mpnet": 514,
581
  "text2vec-base-chinese": 512,
582
  "text2vec-large-chinese": 512,
583
+ "text-embedding-3-large": 8191,
584
+ "text-embedding-3-small": 8191,
585
  "text-embedding-ada-002": 8191,
586
  "text-similarity-ada-001": 2046,
587
  "text-similarity-babbage-001": 2046,
 
951
  # Model & at least one result
952
  if len(out) > 1:
953
  if add_emb_dim:
954
+ try:
955
+ # Fails on gated repos, so we only include scores for them
956
+ out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
957
+ except:
958
+ pass
959
  df_list.append(out)
960
  df = pd.DataFrame(df_list)
961
  # If there are any models that are the same, merge them