Alina Lozovskaia committed on
Commit e34e357 • 1 Parent(s): 8ff5577

Updated init_space() mostly

Files changed (2)
  1. app.py +44 -47
  2. src/populate.py +3 -3
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import logging
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -47,6 +48,7 @@ from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -55,64 +57,48 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
-def init_space(full_init: bool = True):
-    if full_init:
-        try:
-            print(EVAL_REQUESTS_PATH)
-            snapshot_download(
-                repo_id=QUEUE_REPO,
-                local_dir=EVAL_REQUESTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-        except Exception:
-            restart_space()
-        try:
-            print(DYNAMIC_INFO_PATH)
-            snapshot_download(
-                repo_id=DYNAMIC_INFO_REPO,
-                local_dir=DYNAMIC_INFO_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-        except Exception:
-            restart_space()
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
+    """Attempt to download dataset with retries."""
+    attempt = 0
+    while attempt < max_attempts:
         try:
-            print(EVAL_RESULTS_PATH)
+            print(f"Downloading {repo_id} to {local_dir}")
             snapshot_download(
-                repo_id=RESULTS_REPO,
-                local_dir=EVAL_RESULTS_PATH,
-                repo_type="dataset",
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
                 tqdm_class=None,
                 etag_timeout=30,
                 max_workers=8,
             )
-        except Exception:
-            restart_space()
+            return
+        except Exception as e:
+            logging.error(f"Error downloading {repo_id}: {e}")
+            attempt += 1
+            if attempt == max_attempts:
+                restart_space()
+                break
 
-    raw_data, original_df = get_leaderboard_df(
+
+def init_space(full_init: bool = True):
+    """Initializes the application space, loading only necessary data."""
+    if full_init:
+        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
+
+    raw_data, leaderboard_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
         cols=COLS,
        benchmark_cols=BENCHMARK_COLS,
     )
-    update_collections(original_df.copy())
-    leaderboard_df = original_df.copy()
+    update_collections(leaderboard_df)
 
-    plot_df = create_plot_df(create_scores_df(raw_data))
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-    return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return leaderboard_df, raw_data, eval_queue_dfs
 
 
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
@@ -121,9 +107,14 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
-    init_space(full_init=do_full_init)
-)
+leaderboard_df, raw_data, eval_queue_dfs = init_space(full_init=do_full_init)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
+
+
+# Data processing for plots now only on demand in the respective Gradio tab
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(raw_data))
+    return plot_df
 
 
 # Searching and filtering
@@ -344,7 +335,8 @@ with demo:
 
            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-               value=original_df[COLS],
+               # value=original_df[COLS],
+               value=leaderboard_df[COLS],  # UPDATED
                headers=COLS,
                datatype=TYPES,
                visible=False,
@@ -406,6 +398,8 @@ with demo:
        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Row():
                with gr.Column():
+                   # UPDATED
+                   plot_df = load_and_create_plots()
                    chart = create_metric_plot_obj(
                        plot_df,
                        [AutoEvalColumn.average.name],
@@ -413,12 +407,15 @@
                    )
                    gr.Plot(value=chart, min_width=500)
                with gr.Column():
+                   # UPDATED
+                   plot_df = load_and_create_plots()
                    chart = create_metric_plot_obj(
                        plot_df,
                        BENCHMARK_COLS,
                        title="Top Scores and Human Baseline Over Time (from last update)",
                    )
                    gr.Plot(value=chart, min_width=500)
+
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
src/populate.py CHANGED
@@ -15,12 +15,12 @@ def get_leaderboard_df(
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
-    print([data for data in all_data_json if data["model_name_for_query"] == "databricks/dbrx-base"])
+    # print([data for data in all_data_json if data["model_name_for_query"] == "databricks/dbrx-base"])
     filter_models_flags(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    print(df.columns)
-    print(df[df["model_name_for_query"] == "databricks/dbrx-base"])
+    # print(df.columns)
+    # print(df[df["model_name_for_query"] == "databricks/dbrx-base"])
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
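The populate.py change only comments the debug prints out. If those diagnostics are still useful, one option is to route them through the logging module that this commit already starts using in app.py, so they stay reachable at DEBUG level without printing on every run. A small sketch under that assumption follows; the helper name is hypothetical, while the column and model names come from the commented-out lines.

import logging

import pandas as pd

logger = logging.getLogger(__name__)


def log_leaderboard_debug(df: pd.DataFrame, model_name: str = "databricks/dbrx-base") -> None:
    """Emit the same diagnostics as the commented-out print() calls, but only when DEBUG logging is enabled."""
    logger.debug("Leaderboard columns: %s", list(df.columns))
    logger.debug("Rows for %s:\n%s", model_name, df[df["model_name_for_query"] == model_name])

Calling log_leaderboard_debug(df) where the prints used to sit, combined with logging.basicConfig(level=logging.DEBUG) during local debugging, recovers the old output on demand.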