Tristan Thrush committed on
Commit 23ca923 • 1 Parent(s): 30f749f

removed requirement to be from autoeval org

Files changed (1)
  1. app.py +31 -30
app.py CHANGED
@@ -45,30 +45,24 @@ def parse_metric_value(value):
     return value
 
 
-def parse_metrics_rows(meta, from_autoeval=False):
+def parse_metrics_rows(meta, only_verified=False):
     if not isinstance(meta["model-index"], list) or len(meta["model-index"]) == 0 or "results" not in meta["model-index"][0]:
         return None
     for result in meta["model-index"][0]["results"]:
         if not isinstance(result, dict) or "dataset" not in result or "metrics" not in result or "type" not in result["dataset"]:
             continue
         dataset = result["dataset"]["type"]
-        row = {"dataset": dataset, "split": "-unspecified-", "config": "-unspecified-", "verified": from_autoeval}
+        row = {"dataset": dataset, "split": "-unspecified-", "config": "-unspecified-"}
         if "split" in result["dataset"]:
             row["split"] = result["dataset"]["split"]
         if "config" in result["dataset"]:
             row["config"] = result["dataset"]["config"]
         no_results = True
         for metric in result["metrics"]:
+            name = metric["type"].lower().strip()
 
-            # On autoeval cards, name is consistent. name seems less consistent than
-            # type for self-reported results on user model cards though.
-            if from_autoeval:
-                name = metric["name"].lower().strip()
-            else:
-                name = metric["type"].lower().strip()
-
-            if name in ("model_id", "dataset", "split", "config", "verified"):
-                # Metrics are not allowed to be named "dataset", "split", "config", or "verified".
+            if name in ("model_id", "dataset", "split", "config"):
+                # Metrics are not allowed to be named "dataset", "split", "config".
                 continue
             value = parse_metric_value(metric.get("value", None))
             if value is None:
@@ -78,10 +72,7 @@ def parse_metrics_rows(meta, from_autoeval=False):
             if name not in row or new_metric_better:
                 # overwrite the metric if the new value is better.
 
-                if from_autoeval:
-                    # if the metric is from autoeval, only include it in the leaderboard if
-                    # it is a verified metric. Unverified metrics are already included
-                    # in the leaderboard from the unverified model card.
+                if only_verified:
                     if "verified" in metric and metric["verified"]:
                         no_results = False
                         row[name] = value
@@ -97,52 +88,65 @@ def get_data_wrapper():
 
     def get_data():
         data = []
-        model_ids = get_model_ids()
+        verified_data = []
+        model_ids = get_model_ids()[:100]
         model_ids_from_autoeval = set(get_model_ids(author="autoevaluate"))
         for model_id in tqdm(model_ids):
            meta = get_metadata(model_id)
            if meta is None:
                continue
-           for row in parse_metrics_rows(meta, from_autoeval=model_id in model_ids_from_autoeval):
+           for row in parse_metrics_rows(meta):
                if row is None:
                    continue
                row["model_id"] = model_id
                data.append(row)
+           for row in parse_metrics_rows(meta, only_verified=True):
+               if row is None:
+                   continue
+               row["model_id"] = model_id
+               verified_data.append(row)
        dataframe = pd.DataFrame.from_records(data)
        dataframe.to_pickle("cache.pkl")
+       verified_dataframe = pd.DataFrame.from_records(verified_data)
+       verified_dataframe.to_pickle("verified_cache.pkl")
 
-    if exists("cache.pkl"):
+    if exists("cache.pkl") and exists("verified_cache.pkl"):
        # If we have saved the results previously, call an asynchronous process
        # to fetch the results and update the saved file. Don't make users wait
        # while we fetch the new results. Instead, display the old results for
        # now. The new results should be loaded when this method
        # is called again.
        dataframe = pd.read_pickle("cache.pkl")
+       verified_dataframe = pd.read_pickle("verified_cache.pkl")
        t = threading.Thread(name='get_data procs', target=get_data)
        t.start()
    else:
        # We have to make the users wait during the first startup of this app.
        get_data()
        dataframe = pd.read_pickle("cache.pkl")
+       verified_dataframe = pd.read_pickle("verified_cache.pkl")
 
-    return dataframe
-
-dataframe = get_data_wrapper()
+    return dataframe, verified_dataframe
 
-selectable_datasets = list(set(dataframe.dataset.tolist()))
+dataframe, verified_dataframe = get_data_wrapper()
 
 st.markdown("# 🤗 Leaderboards")
 
+only_verified_results = st.sidebar.checkbox(
+    "Filter for Verified Results",
+)
+
+if only_verified_results:
+    dataframe = verified_dataframe
+
+selectable_datasets = list(set(dataframe.dataset.tolist()))
+
 query_params = st.experimental_get_query_params()
 default_dataset = "common_voice"
 if "dataset" in query_params:
     if len(query_params["dataset"]) > 0 and query_params["dataset"][0] in selectable_datasets:
         default_dataset = query_params["dataset"][0]
 
-only_verified_results = st.sidebar.checkbox(
-    "Filter for Verified Results",
-)
-
 dataset = st.sidebar.selectbox(
     "Dataset",
     selectable_datasets,
@@ -154,9 +158,6 @@ st.experimental_set_query_params(**{"dataset": [dataset]})
 dataset_df = dataframe[dataframe.dataset == dataset]
 dataset_df = dataset_df.dropna(axis="columns", how="all")
 
-if only_verified_results:
-    dataset_df = dataset_df[dataset_df["verified"]]
-
 selectable_configs = list(set(dataset_df["config"]))
 config = st.sidebar.selectbox(
     "Config",
@@ -171,7 +172,7 @@ split = st.sidebar.selectbox(
 )
 dataset_df = dataset_df[dataset_df.split == split]
 
-selectable_metrics = list(filter(lambda column: column not in ("model_id", "dataset", "split", "config", "verified"), dataset_df.columns))
+selectable_metrics = list(filter(lambda column: column not in ("model_id", "dataset", "split", "config"), dataset_df.columns))
 
 dataset_df = dataset_df.filter(["model_id"] + selectable_metrics)
 dataset_df = dataset_df.dropna(thresh=2)  # Want at least two non-na values (one for model_id and one for a metric).
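
For context, here is a minimal sketch of the kind of model-index card metadata that parse_metrics_rows consumes after this change. The model card contents below are invented for illustration; only the field names the parser reads (dataset type/config/split, metric type/value/verified) come from the code in the diff.

# Illustrative only: a metadata dict shaped like meta["model-index"][0]["results"],
# with made-up dataset and metric values.
meta = {
    "model-index": [
        {
            "results": [
                {
                    "dataset": {
                        "type": "common_voice",  # becomes row["dataset"]
                        "config": "en",          # becomes row["config"]
                        "split": "test",         # becomes row["split"]
                    },
                    "metrics": [
                        # In the only_verified=True pass, only metrics carrying
                        # "verified": True are recorded into the row.
                        {"type": "wer", "value": 23.1, "verified": True},
                        {"type": "cer", "value": 7.4},  # self-reported, unverified
                    ],
                }
            ]
        }
    ]
}

Under the new flow, every card is parsed twice: once normally (rows go to cache.pkl) and once with only_verified=True (rows go to verified_cache.pkl), so only the verified "wer" entry above would reach the verified cache. The "Filter for Verified Results" checkbox then simply swaps in the verified dataframe instead of filtering a per-row "verified" column.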