Spaces:
No application file
No application file
Muennighoff
committed on
Commit
·
099d855
1
Parent(s):
6e58d27
Add Model Size (GB)
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import json
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
import gradio as gr
|
5 |
-
from huggingface_hub import HfApi, hf_hub_download
|
6 |
from huggingface_hub.repocard import metadata_load
|
7 |
import pandas as pd
|
8 |
|
@@ -233,6 +233,7 @@ EXTERNAL_MODEL_TO_LINK = {
|
|
233 |
"all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
|
234 |
"paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
235 |
"paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
|
|
236 |
}
|
237 |
|
238 |
EXTERNAL_MODEL_TO_DIM = {
|
@@ -338,6 +339,39 @@ EXTERNAL_MODEL_TO_SEQLEN = {
|
|
338 |
"unsup-simcse-bert-base-uncased": 512,
|
339 |
}
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
|
342 |
MODELS_TO_SKIP = {
|
343 |
"baseplate/instructor-large-1", # Duplicate
|
@@ -404,9 +438,9 @@ for model in EXTERNAL_MODELS:
|
|
404 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
405 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
406 |
|
407 |
-
def
|
408 |
filenames = [sib.rfilename for sib in model.siblings]
|
409 |
-
dim, seq = "", ""
|
410 |
if "1_Pooling/config.json" in filenames:
|
411 |
st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
|
412 |
dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
|
@@ -419,7 +453,23 @@ def get_dim_seq(model):
|
|
419 |
if not dim:
|
420 |
dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
|
421 |
seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
|
424 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
425 |
api = HfApi()
|
@@ -439,6 +489,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
439 |
# Model & at least one result
|
440 |
if len(res) > 1:
|
441 |
if add_emb_dim:
|
|
|
442 |
res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
|
443 |
res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
|
444 |
df_list.append(res)
|
@@ -474,7 +525,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
474 |
# Model & at least one result
|
475 |
if len(out) > 1:
|
476 |
if add_emb_dim:
|
477 |
-
out["Embedding Dimensions"], out["Sequence Length"] =
|
478 |
df_list.append(out)
|
479 |
df = pd.DataFrame(df_list)
|
480 |
# Put 'Model' column first
|
@@ -532,7 +583,7 @@ def get_mteb_average():
|
|
532 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
533 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
534 |
|
535 |
-
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
536 |
|
537 |
return DATA_OVERALL
|
538 |
|
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
import gradio as gr
|
5 |
+
from huggingface_hub import get_hf_file_metadata, HfApi, hf_hub_download, hf_hub_url
|
6 |
from huggingface_hub.repocard import metadata_load
|
7 |
import pandas as pd
|
8 |
|
|
|
233 |
"all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
|
234 |
"paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
|
235 |
"paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
236 |
+
"contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
|
237 |
}
|
238 |
|
239 |
EXTERNAL_MODEL_TO_DIM = {
|
|
|
339 |
"unsup-simcse-bert-base-uncased": 512,
|
340 |
}
|
341 |
|
342 |
+
EXTERNAL_MODEL_TO_SIZE = {
|
343 |
+
"gtr-t5-xxl": 9.73,
|
344 |
+
"gtr-t5-xl": 2.48,
|
345 |
+
"gtr-t5-large": 0.67,
|
346 |
+
"gtr-t5-base": 0.22,
|
347 |
+
"sentence-t5-xxl": 9.73,
|
348 |
+
"sentence-t5-xl": 2.48,
|
349 |
+
"sentence-t5-large": 0.67,
|
350 |
+
"sentence-t5-base": 0.22,
|
351 |
+
"all-mpnet-base-v2": 0.44,
|
352 |
+
"all-MiniLM-L12-v2": 0.13,
|
353 |
+
"all-MiniLM-L6-v2": 0.09,
|
354 |
+
"contriever-base-msmarco": 0.44,
|
355 |
+
"paraphrase-multilingual-mpnet-base-v2": 1.11,
|
356 |
+
"paraphrase-multilingual-MiniLM-L12-v2": 0.47,
|
357 |
+
"msmarco-bert-co-condensor": 0.44,
|
358 |
+
"sup-simcse-bert-base-uncased": 0.44,
|
359 |
+
"unsup-simcse-bert-base-uncased": 0.44,
|
360 |
+
"LaBSE": 1.88,
|
361 |
+
"komninos": 0.27,
|
362 |
+
"glove.6B.300d": 0.48,
|
363 |
+
"allenai-specter": 0.44,
|
364 |
+
"bert-base-uncased": 0.44,
|
365 |
+
"LASER2": 0.17,
|
366 |
+
"cross-en-de-roberta-sentence-transformer": 1.11,
|
367 |
+
"gbert-base": 0.44,
|
368 |
+
"gbert-large": 1.35,
|
369 |
+
"gelectra-base": 0.44,
|
370 |
+
"gelectra-large": 1.34,
|
371 |
+
"use-cmlm-multilingual": 1.89,
|
372 |
+
"xlm-roberta-large": 2.24,
|
373 |
+
"gottbert-base": 0.51
|
374 |
+
}
|
375 |
|
376 |
MODELS_TO_SKIP = {
|
377 |
"baseplate/instructor-large-1", # Duplicate
|
|
|
438 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
439 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
440 |
|
441 |
+
def get_dim_seq_size(model):
|
442 |
filenames = [sib.rfilename for sib in model.siblings]
|
443 |
+
dim, seq, size = "", "", ""
|
444 |
if "1_Pooling/config.json" in filenames:
|
445 |
st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
|
446 |
dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
|
|
|
453 |
if not dim:
|
454 |
dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
|
455 |
seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
|
456 |
+
# Get model file size without downloading
|
457 |
+
if "pytorch_model.bin" in filenames:
|
458 |
+
url = hf_hub_url(model.modelId, filename="pytorch_model.bin")
|
459 |
+
meta = get_hf_file_metadata(url)
|
460 |
+
size = round(meta.size / 1e9, 2)
|
461 |
+
elif "pytorch_model.bin.index.json" in filenames:
|
462 |
+
index_path = hf_hub_download(model.modelId, filename="pytorch_model.bin.index.json")
|
463 |
+
"""
|
464 |
+
{
|
465 |
+
"metadata": {
|
466 |
+
"total_size": 28272820224
|
467 |
+
},....
|
468 |
+
"""
|
469 |
+
size = json.load(open(index_path))
|
470 |
+
if ("metadata" in size) and ("total_size" in size["metadata"]):
|
471 |
+
size = round(size["metadata"]["total_size"] / 1e9, 2)
|
472 |
+
return dim, seq, size
|
473 |
|
474 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
475 |
api = HfApi()
|
|
|
489 |
# Model & at least one result
|
490 |
if len(res) > 1:
|
491 |
if add_emb_dim:
|
492 |
+
res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
|
493 |
res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
|
494 |
res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
|
495 |
df_list.append(res)
|
|
|
525 |
# Model & at least one result
|
526 |
if len(out) > 1:
|
527 |
if add_emb_dim:
|
528 |
+
out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
|
529 |
df_list.append(out)
|
530 |
df = pd.DataFrame(df_list)
|
531 |
# Put 'Model' column first
|
|
|
583 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
584 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
585 |
|
586 |
+
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
587 |
|
588 |
return DATA_OVERALL
|
589 |
|