eduagarcia committed
Commit
6f8ad2f
1 Parent(s): 9066f73

Caches model card metadata to a temporary file to speed up initialization

Files changed (2)
  1. .gitignore +2 -1
  2. app.py +21 -10
.gitignore CHANGED
@@ -1 +1,2 @@
-*.pyc
+*.pyc
+model_infos.json
app.py CHANGED
@@ -151,10 +151,14 @@ def add_rank(df):
     df.fillna("", inplace=True)
     return df
 
-MODEL_CARD_METADATA = {}
-MODEL_EMB_DIM = {}
+model_infos_path = "model_infos.json"
+MODEL_INFOS = {}
+if os.path.exists(model_infos_path):
+    with open(model_infos_path) as f:
+        MODEL_INFOS = json.load(f)
+
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True):
-    global MODEL_CARD_METADATA, MODEL_EMB_DIM
+    global MODEL_INFOS
     api = API
     models = api.list_models(filter="mteb")
     # Initialize list to models that we cannot fetch metadata from
@@ -181,11 +185,13 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     for model in models:
         if model.modelId in MODELS_TO_SKIP: continue
         print("MODEL", model.modelId)
-        if model.modelId not in MODEL_CARD_METADATA or refresh:
+        if model.modelId not in MODEL_INFOS or refresh:
             readme_path = hf_hub_download(model.modelId, filename="README.md")
             meta = metadata_load(readme_path)
-            MODEL_CARD_METADATA[model.modelId] = meta
-        meta = MODEL_CARD_METADATA[model.modelId]
+            MODEL_INFOS[model.modelId] = {
+                "metadata": meta
+            }
+        meta = MODEL_INFOS[model.modelId]["metadata"]
         if "model-index" not in meta:
             continue
         # meta['model-index'][0]["results"] is list of elements like:
@@ -217,14 +223,19 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         if add_emb_dim:
             try:
                 # Fails on gated repos, so we only include scores for them
-                if model.modelId not in MODEL_EMB_DIM or refresh:
-                    MODEL_EMB_DIM[model.modelId] = get_dim_seq_size(model)
-                out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = MODEL_EMB_DIM[model.modelId]
+                if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh:
+                    MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
+                out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
             except:
-                MODEL_EMB_DIM[model.modelId] = "", "", "", ""
+                MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", ""
         df_list.append(out)
         if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}:
            SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
+
+    # Save & cache MODEL_INFOS
+    with open("model_infos.json", "w") as f:
+        json.dump(MODEL_INFOS, f)
+
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
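
To make the caching pattern introduced by this commit easier to follow, the sketch below distills it outside of app.py: load model_infos.json on startup, fetch a model's metadata only on a cache miss (or when refresh=True), and write the whole dictionary back to disk once the loop is done. This is a minimal, hypothetical standalone example, not code from the repository; fetch_metadata and get_model_info are placeholder names standing in for the README download and get_dim_seq_size calls.

import json
import os

CACHE_PATH = "model_infos.json"  # the same file the commit adds to .gitignore

# Load any previously cached entries so a restart does not refetch everything.
MODEL_INFOS = {}
if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH) as f:
        MODEL_INFOS = json.load(f)

def fetch_metadata(model_id):
    # Hypothetical stand-in for the expensive work (README download, size probing).
    return {"metadata": {"modelId": model_id}, "dim_seq_size": ["", "", "", ""]}

def get_model_info(model_id, refresh=False):
    # Read-through cache: fetch only on a miss or when an explicit refresh is requested.
    if model_id not in MODEL_INFOS or refresh:
        MODEL_INFOS[model_id] = fetch_metadata(model_id)
    return MODEL_INFOS[model_id]

def save_cache():
    # Persist the in-memory cache; JSON has no tuple type, so tuples are stored as lists.
    with open(CACHE_PATH, "w") as f:
        json.dump(MODEL_INFOS, f)

info = get_model_info("example/model")
save_cache()

That tuple-to-list round trip is also why the diff stores list(get_dim_seq_size(model)) and unpacks tuple(MODEL_INFOS[model.modelId]["dim_seq_size"]): values written by json.dump come back as plain lists on the next startup. Adding model_infos.json to .gitignore keeps the cache local rather than committing it to the repository.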