Spaces:
Running
Running
Muennighoff
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -334,6 +334,8 @@ EXTERNAL_MODELS = [
|
|
334 |
"st-polish-paraphrase-from-mpnet",
|
335 |
"text2vec-base-chinese",
|
336 |
"text2vec-large-chinese",
|
|
|
|
|
337 |
"text-embedding-ada-002",
|
338 |
"text-similarity-ada-001",
|
339 |
"text-similarity-babbage-001",
|
@@ -414,17 +416,19 @@ EXTERNAL_MODEL_TO_LINK = {
|
|
414 |
"st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
|
415 |
"text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
|
416 |
"text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
|
417 |
-
"text-embedding-
|
418 |
-
"text-
|
419 |
-
"text-
|
420 |
-
"text-similarity-
|
421 |
-
"text-similarity-
|
422 |
-
"text-
|
423 |
-
"text-
|
424 |
-
"text-search-ada-001": "https://
|
425 |
-
"text-search-
|
426 |
-
"text-search-
|
427 |
-
"text-search-
|
|
|
|
|
428 |
"titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html",
|
429 |
"unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
|
430 |
"use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
|
@@ -494,6 +498,8 @@ EXTERNAL_MODEL_TO_DIM = {
|
|
494 |
"st-polish-paraphrase-from-mpnet": 768,
|
495 |
"text2vec-base-chinese": 768,
|
496 |
"text2vec-large-chinese": 1024,
|
|
|
|
|
497 |
"text-embedding-ada-002": 1536,
|
498 |
"text-similarity-ada-001": 1024,
|
499 |
"text-similarity-babbage-001": 2048,
|
@@ -574,6 +580,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
|
|
574 |
"st-polish-paraphrase-from-mpnet": 514,
|
575 |
"text2vec-base-chinese": 512,
|
576 |
"text2vec-large-chinese": 512,
|
|
|
|
|
577 |
"text-embedding-ada-002": 8191,
|
578 |
"text-similarity-ada-001": 2046,
|
579 |
"text-similarity-babbage-001": 2046,
|
@@ -943,7 +951,11 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
943 |
# Model & at least one result
|
944 |
if len(out) > 1:
|
945 |
if add_emb_dim:
|
946 |
-
|
|
|
|
|
|
|
|
|
947 |
df_list.append(out)
|
948 |
df = pd.DataFrame(df_list)
|
949 |
# If there are any models that are the same, merge them
|
|
|
334 |
"st-polish-paraphrase-from-mpnet",
|
335 |
"text2vec-base-chinese",
|
336 |
"text2vec-large-chinese",
|
337 |
+
"text-embedding-3-small",
|
338 |
+
"text-embedding-3-large",
|
339 |
"text-embedding-ada-002",
|
340 |
"text-similarity-ada-001",
|
341 |
"text-similarity-babbage-001",
|
|
|
416 |
"st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
|
417 |
"text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
|
418 |
"text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
|
419 |
+
"text-embedding-3-small": "https://openai.com/blog/new-embedding-models-and-api-updates",
|
420 |
+
"text-embedding-3-large": "https://openai.com/blog/new-embedding-models-and-api-updates",
|
421 |
+
"text-embedding-ada-002": "https://openai.com/blog/new-and-improved-embedding-model",
|
422 |
+
"text-similarity-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
423 |
+
"text-similarity-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
424 |
+
"text-similarity-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
425 |
+
"text-similarity-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
426 |
+
"text-search-ada-doc-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
427 |
+
"text-search-ada-query-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
428 |
+
"text-search-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
429 |
+
"text-search-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
430 |
+
"text-search-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
431 |
+
"text-search-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings",
|
432 |
"titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html",
|
433 |
"unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
|
434 |
"use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
|
|
|
498 |
"st-polish-paraphrase-from-mpnet": 768,
|
499 |
"text2vec-base-chinese": 768,
|
500 |
"text2vec-large-chinese": 1024,
|
501 |
+
"text-embedding-3-large": 3072,
|
502 |
+
"text-embedding-3-small": 1536,
|
503 |
"text-embedding-ada-002": 1536,
|
504 |
"text-similarity-ada-001": 1024,
|
505 |
"text-similarity-babbage-001": 2048,
|
|
|
580 |
"st-polish-paraphrase-from-mpnet": 514,
|
581 |
"text2vec-base-chinese": 512,
|
582 |
"text2vec-large-chinese": 512,
|
583 |
+
"text-embedding-3-large": 8191,
|
584 |
+
"text-embedding-3-small": 8191,
|
585 |
"text-embedding-ada-002": 8191,
|
586 |
"text-similarity-ada-001": 2046,
|
587 |
"text-similarity-babbage-001": 2046,
|
|
|
951 |
# Model & at least one result
|
952 |
if len(out) > 1:
|
953 |
if add_emb_dim:
|
954 |
+
try:
|
955 |
+
# Fails on gated repos, so we only include scores for them
|
956 |
+
out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
|
957 |
+
except:
|
958 |
+
pass
|
959 |
df_list.append(out)
|
960 |
df = pd.DataFrame(df_list)
|
961 |
# If there are any models that are the same, merge them
|