Muennighoff committed
Commit 099d855
1 Parent(s): 6e58d27

Add Model Size (GB)

Files changed (1)
app.py  +57 -6
app.py CHANGED
@@ -2,7 +2,7 @@ import json
 
 from datasets import load_dataset
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import get_hf_file_metadata, HfApi, hf_hub_download, hf_hub_url
 from huggingface_hub.repocard import metadata_load
 import pandas as pd
 
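Note (not part of the commit): the two newly imported helpers, hf_hub_url and get_hf_file_metadata, are what the size lookup added further down relies on. A minimal sketch of how they combine for a repo that ships a single pytorch_model.bin; the repo id here is only an illustrative example:

    from huggingface_hub import get_hf_file_metadata, hf_hub_url

    # Build the resolve URL for the weight file and read its metadata with a HEAD request,
    # so the checkpoint itself is never downloaded.
    url = hf_hub_url("sentence-transformers/all-MiniLM-L6-v2", filename="pytorch_model.bin")
    meta = get_hf_file_metadata(url)
    print(round(meta.size / 1e9, 2))  # file size in GB, ~0.09 for this model per the table added below
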
@@ -233,6 +233,7 @@ EXTERNAL_MODEL_TO_LINK = {
     "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
     "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
     "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
 }
 
 EXTERNAL_MODEL_TO_DIM = {
@@ -338,6 +339,39 @@ EXTERNAL_MODEL_TO_SEQLEN = {
     "unsup-simcse-bert-base-uncased": 512,
 }
 
+EXTERNAL_MODEL_TO_SIZE = {
+    "gtr-t5-xxl": 9.73,
+    "gtr-t5-xl": 2.48,
+    "gtr-t5-large": 0.67,
+    "gtr-t5-base": 0.22,
+    "sentence-t5-xxl": 9.73,
+    "sentence-t5-xl": 2.48,
+    "sentence-t5-large": 0.67,
+    "sentence-t5-base": 0.22,
+    "all-mpnet-base-v2": 0.44,
+    "all-MiniLM-L12-v2": 0.13,
+    "all-MiniLM-L6-v2": 0.09,
+    "contriever-base-msmarco": 0.44,
+    "paraphrase-multilingual-mpnet-base-v2": 1.11,
+    "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
+    "msmarco-bert-co-condensor": 0.44,
+    "sup-simcse-bert-base-uncased": 0.44,
+    "unsup-simcse-bert-base-uncased": 0.44,
+    "LaBSE": 1.88,
+    "komninos": 0.27,
+    "glove.6B.300d": 0.48,
+    "allenai-specter": 0.44,
+    "bert-base-uncased": 0.44,
+    "LASER2": 0.17,
+    "cross-en-de-roberta-sentence-transformer": 1.11,
+    "gbert-base": 0.44,
+    "gbert-large": 1.35,
+    "gelectra-base": 0.44,
+    "gelectra-large": 1.34,
+    "use-cmlm-multilingual": 1.89,
+    "xlm-roberta-large": 2.24,
+    "gottbert-base": 0.51
+}
 
 MODELS_TO_SKIP = {
     "baseplate/instructor-large-1", # Duplicate
@@ -404,9 +438,9 @@ for model in EXTERNAL_MODELS:
             ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
             EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
-def get_dim_seq(model):
+def get_dim_seq_size(model):
     filenames = [sib.rfilename for sib in model.siblings]
-    dim, seq = "", ""
+    dim, seq, size = "", "", ""
     if "1_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
@@ -419,7 +453,23 @@ def get_dim_seq(model):
     if not dim:
         dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
     seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
-    return dim, seq
+    # Get model file size without downloading
+    if "pytorch_model.bin" in filenames:
+        url = hf_hub_url(model.modelId, filename="pytorch_model.bin")
+        meta = get_hf_file_metadata(url)
+        size = round(meta.size / 1e9, 2)
+    elif "pytorch_model.bin.index.json" in filenames:
+        index_path = hf_hub_download(model.modelId, filename="pytorch_model.bin.index.json")
+        """
+        {
+          "metadata": {
+            "total_size": 28272820224
+          },....
+        """
+        size = json.load(open(index_path))
+        if ("metadata" in size) and ("total_size" in size["metadata"]):
+            size = round(size["metadata"]["total_size"] / 1e9, 2)
+    return dim, seq, size
 
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
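Aside (not from the commit): sharded checkpoints have no single pytorch_model.bin, which is what the elif branch above covers; the small pytorch_model.bin.index.json already records the combined weight size under metadata.total_size (the inline example's 28272820224 bytes is about 28.27 GB). A standalone sketch of that branch; the repo id below is a placeholder, not a real model:

    import json
    from huggingface_hub import hf_hub_download

    def sharded_size_gb(model_id: str) -> float:
        # The index JSON is tiny, so downloading it is cheap; the weight shards stay remote.
        index_path = hf_hub_download(model_id, filename="pytorch_model.bin.index.json")
        with open(index_path) as f:
            index = json.load(f)
        return round(index["metadata"]["total_size"] / 1e9, 2)

    # Placeholder repo id; substitute a model that actually ships a sharded PyTorch checkpoint.
    print(sharded_size_gb("some-org/some-sharded-model"))
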
@@ -439,6 +489,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # Model & at least one result
         if len(res) > 1:
             if add_emb_dim:
+                res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
                 res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
             df_list.append(res)
@@ -474,7 +525,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # Model & at least one result
         if len(out) > 1:
             if add_emb_dim:
-                out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
+                out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
             df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
@@ -532,7 +583,7 @@ def get_mteb_average():
     DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
     DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL