alozowski committed
Commit 2e74c81 • 1 Parent(s): 122c7af

bugfix and populate refactoring

Files changed (3)
  1. app.py +7 -9
  2. src/envs.py +3 -0
  3. src/populate.py +14 -17
app.py CHANGED
@@ -87,18 +87,19 @@ def init_space(full_init: bool = True):
         download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
         download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
 
-    raw_data, leaderboard_df = get_leaderboard_df(
+    raw_data, original_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
         cols=COLS,
         benchmark_cols=BENCHMARK_COLS,
     )
-    update_collections(leaderboard_df)
-
+    update_collections(original_df)
+    leaderboard_df = original_df.copy()
+
     eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-    return leaderboard_df, raw_data, eval_queue_dfs
+    return leaderboard_df, raw_data, original_df, eval_queue_dfs
 
 
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
@@ -107,7 +108,7 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, raw_data, eval_queue_dfs = init_space(full_init=do_full_init)
+leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 
 
@@ -335,8 +336,7 @@ with demo:
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                # value=original_df[COLS],
-                value=leaderboard_df[COLS], # UPDATED
+                value=original_df[COLS],
                 headers=COLS,
                 datatype=TYPES,
                 visible=False,
@@ -398,7 +398,6 @@ with demo:
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
-                    # UPDATED
                     plot_df = load_and_create_plots()
                     chart = create_metric_plot_obj(
                         plot_df,
@@ -407,7 +406,6 @@ with demo:
                     )
                     gr.Plot(value=chart, min_width=500)
                 with gr.Column():
-                    # UPDATED
                     plot_df = load_and_create_plots()
                     chart = create_metric_plot_obj(
                         plot_df,
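Note on the init_space() change: the function now returns both the pristine frame (original_df) and a working frame built with leaderboard_df = original_df.copy(), presumably so that anything mutating the working frame cannot disturb the frame backing the hidden search table. A minimal sketch of what the copy buys, using made-up column names rather than the real COLS:

import pandas as pd

# Hypothetical columns for illustration only.
original_df = pd.DataFrame({"model": ["org/a", "org/b"], "average": [51.2, 48.7]})
leaderboard_df = original_df.copy()

# Mutating the working copy (as a table filter might) leaves the original intact.
leaderboard_df.loc[leaderboard_df["model"] == "org/a", "average"] = 0.0

print(original_df["average"].tolist())     # [51.2, 48.7]
print(leaderboard_df["average"].tolist())  # [0.0, 48.7]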
src/envs.py CHANGED
@@ -16,6 +16,9 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
 CACHE_PATH = os.getenv("HF_HOME", ".")
+# Check if the CACHE_PATH is a directory and if we have write access, if not set to '.'
+if not os.path.isdir(CACHE_PATH) or not os.access(CACHE_PATH, os.W_OK):
+    CACHE_PATH = "."
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
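The new guard simply falls back to the current directory whenever HF_HOME points at a location that is missing or not writable. A small standalone sketch of the same check, using a deliberately nonexistent path as the hypothetical cache directory:

import os

# Hypothetical value: a cache directory that does not exist on this machine.
os.environ["HF_HOME"] = "/nonexistent/hf-cache"

CACHE_PATH = os.getenv("HF_HOME", ".")
# Same guard as the one added to src/envs.py: fall back to the working
# directory when the configured cache is missing or not writable.
if not os.path.isdir(CACHE_PATH) or not os.access(CACHE_PATH, os.W_OK):
    CACHE_PATH = "."

print(CACHE_PATH)                                # "."
print(os.path.join(CACHE_PATH, "eval-queue"))    # "./eval-queue"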
src/populate.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
@@ -26,25 +27,20 @@ def _process_model_data(entry, model_name_key="model", revision_key="revision"):
 
 def get_evaluation_queue_df(save_path, cols):
     """Generate dataframes for pending, running, and finished evaluation entries."""
+    save_path = pathlib.Path(save_path)
     all_evals = []
-    entries = os.listdir(save_path)
-    for entry in entries:
-        if entry.startswith(".") or entry.endswith(".md"):
+
+    for path in save_path.rglob('*'):
+        if path.is_dir():
+            continue
+        if path.name.startswith('.'):
+            continue
+        if path.name.endswith('.md'):
             continue
-        file_path = os.path.join(save_path, entry)
-        if os.path.isfile(file_path): # Check if it's a file
-            data = _load_json_data(file_path)
-            if data:
-                all_evals.append(_process_model_data(data))
-        else:
-            # Optionally handle directory contents if needed
-            sub_entries = os.listdir(file_path)
-            for sub_entry in sub_entries:
-                sub_file_path = os.path.join(file_path, sub_entry)
-                if os.path.isfile(sub_file_path):
-                    data = _load_json_data(sub_file_path)
-                    if data:
-                        all_evals.append(_process_model_data(data))
+
+        data = _load_json_data(path)
+        if data:
+            all_evals.append(_process_model_data(data))
 
     # Organizing data by status
     status_map = {
@@ -72,3 +68,4 @@ def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
+
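The refactor collapses the two-level os.listdir() loops into a single pathlib.Path.rglob("*") pass, which also picks up request files nested more than one level deep. A self-contained sketch of the same traversal over a throwaway directory (file names and JSON contents are invented, not the real queue layout, and json.loads stands in for _load_json_data / _process_model_data):

import json
import pathlib
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    root = pathlib.Path(tmp)
    (root / "some-org").mkdir()
    (root / "some-org" / "some-model_eval_request.json").write_text(
        json.dumps({"model": "some-org/some-model", "status": "PENDING"})
    )
    (root / "README.md").write_text("skipped because it ends with .md")

    all_evals = []
    for path in root.rglob("*"):
        # Same filters as the refactored get_evaluation_queue_df.
        if path.is_dir() or path.name.startswith(".") or path.name.endswith(".md"):
            continue
        data = json.loads(path.read_text())
        if data:
            all_evals.append(data)

    print(all_evals)  # [{'model': 'some-org/some-model', 'status': 'PENDING'}]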