Clémentine committed on
Commit
9166535
1 Parent(s): 33475fb
Files changed (3) hide show
  1. app.py +22 -260
  2. app_bkp.py +316 -0
  3. src/display/about.py +2 -85
app.py CHANGED
@@ -37,205 +37,33 @@ from src.envs import (
37
  REPO_ID,
38
  HF_HOME,
39
  )
40
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
41
- from src.submission.submit import add_new_eval
42
- from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
43
-
44
- # Configure logging
45
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
46
-
47
-
48
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
49
- # This controls whether a full initialization should be performed.
50
- DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
51
- LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
52
-
53
- def restart_space():
54
- API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
55
-
56
-
57
- def time_diff_wrapper(func):
58
- def wrapper(*args, **kwargs):
59
- start_time = time.time()
60
- result = func(*args, **kwargs)
61
- end_time = time.time()
62
- diff = end_time - start_time
63
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
64
- return result
65
-
66
- return wrapper
67
-
68
-
69
- @time_diff_wrapper
70
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
71
- """Download dataset with exponential backoff retries."""
72
- attempt = 0
73
- while attempt < max_attempts:
74
- try:
75
- logging.info(f"Downloading {repo_id} to {local_dir}")
76
- snapshot_download(
77
- repo_id=repo_id,
78
- local_dir=local_dir,
79
- repo_type=repo_type,
80
- tqdm_class=None,
81
- etag_timeout=30,
82
- max_workers=8,
83
- )
84
- logging.info("Download successful")
85
- return
86
- except Exception as e:
87
- wait_time = backoff_factor**attempt
88
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
89
- time.sleep(wait_time)
90
- attempt += 1
91
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
92
-
93
- def get_latest_data_leaderboard(leaderboard_initial_df = None):
94
- current_time = datetime.datetime.now()
95
- global LAST_UPDATE_LEADERBOARD
96
- if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
97
- return leaderboard_initial_df
98
- LAST_UPDATE_LEADERBOARD = current_time
99
- leaderboard_dataset = datasets.load_dataset(
100
- AGGREGATED_REPO,
101
- "default",
102
- split="train",
103
- cache_dir=HF_HOME,
104
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
105
- verification_mode="no_checks"
106
- )
107
-
108
- leaderboard_df = get_leaderboard_df(
109
- leaderboard_dataset=leaderboard_dataset,
110
- cols=COLS,
111
- benchmark_cols=BENCHMARK_COLS,
112
- )
113
-
114
- return leaderboard_df
115
-
116
- def get_latest_data_queue():
117
- eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
118
- return eval_queue_dfs
119
-
120
- def init_space():
121
- """Initializes the application space, loading only necessary data."""
122
- if DO_FULL_INIT:
123
- # These downloads only occur on full initialization
124
- try:
125
- download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
126
- except Exception:
127
- restart_space()
128
-
129
- # Always redownload the leaderboard DataFrame
130
- leaderboard_df = get_latest_data_leaderboard()
131
-
132
- # Evaluation queue DataFrame retrieval is independent of initialization detail level
133
- eval_queue_dfs = get_latest_data_queue()
134
-
135
- return leaderboard_df, eval_queue_dfs
136
-
137
-
138
- # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
139
- # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
140
- leaderboard_df, eval_queue_dfs = init_space()
141
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
142
-
143
-
144
- # Data processing for plots now only on demand in the respective Gradio tab
145
- def load_and_create_plots():
146
- plot_df = create_plot_df(create_scores_df(leaderboard_df))
147
- return plot_df
148
-
149
- def init_leaderboard(dataframe):
150
- return Leaderboard(
151
- value = dataframe,
152
- datatype=[c.type for c in fields(AutoEvalColumn)],
153
- select_columns=SelectColumns(
154
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
155
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
156
- label="Select Columns to Display:",
157
- ),
158
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
159
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
160
- filter_columns=[
161
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
162
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
163
- ColumnFilter(
164
- AutoEvalColumn.params.name,
165
- type="slider",
166
- min=0.01,
167
- max=150,
168
- label="Select the number of parameters (B)",
169
- ),
170
- ColumnFilter(
171
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
172
- ),
173
- ColumnFilter(
174
- AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
175
- ),
176
- ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
177
- ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
178
- ],
179
- bool_checkboxgroup_label="Hide models",
180
- interactive=False,
181
- )
182
 
183
  demo = gr.Blocks(css=custom_css)
184
  with demo:
185
  gr.HTML(TITLE)
186
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
187
 
188
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
189
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
190
- leaderboard = init_leaderboard(leaderboard_df)
191
-
192
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
193
- with gr.Row():
194
- with gr.Column():
195
- plot_df = load_and_create_plots()
196
- chart = create_metric_plot_obj(
197
- plot_df,
198
- [AutoEvalColumn.average.name],
199
- title="Average of Top Scores and Human Baseline Over Time (from last update)",
200
- )
201
- gr.Plot(value=chart, min_width=500)
202
- with gr.Column():
203
- plot_df = load_and_create_plots()
204
- chart = create_metric_plot_obj(
205
- plot_df,
206
- BENCHMARK_COLS,
207
- title="Top Scores and Human Baseline Over Time (from last update)",
208
- )
209
- gr.Plot(value=chart, min_width=500)
210
-
211
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
212
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
213
-
214
- with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
215
- gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
216
-
217
- with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
218
- countdown = gr.HTML(
219
- """<div align="center">
220
- <div position: relative>
221
- <img
222
- src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
223
- allowtransparency="true"
224
- style="display:block;width:100%;height:auto;"
225
- />
226
- <iframe
227
- src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&amp;timezone=Europe%2FParis&amp;width=&amp;style=circles&amp;uid=815898&amp;loc=https://logwork.com/countdown-fxmc&amp;language=en&amp;textcolor=&amp;background=%23ffd21e&amp;date=2024-06-26%2015%3A00%3A00&amp;digitscolor=%23ff9d00&amp;unitscolor=&amp"
228
- style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
229
- scrolling="no"
230
- allowtransparency="true"
231
- frameborder="0"
232
- allowfullscreen
233
- />
234
- </div>
235
- </div>"""
236
- )
237
- #gif = gr.Image(value="./gif.gif", interactive=False)
238
- gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
239
 
240
  with gr.Row():
241
  with gr.Accordion("📙 Citation", open=False):
@@ -247,70 +75,4 @@ with demo:
247
  show_copy_button=True,
248
  )
249
 
250
- demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
251
-
252
-
253
- demo.queue(default_concurrency_limit=40)
254
-
255
- # Start ephemeral Spaces on PRs (see config in README.md)
256
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
257
-
258
- def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
259
- # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
260
- # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
261
- # ht to Lucain!
262
- if SPACE_ID is None:
263
- print("Not in a Space: Space CI disabled.")
264
- return WebhooksServer(ui=demo)
265
-
266
- if IS_EPHEMERAL_SPACE:
267
- print("In an ephemeral Space: Space CI disabled.")
268
- return WebhooksServer(ui=demo)
269
-
270
- card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
271
- config = card.data.get("space_ci", {})
272
- print(f"Enabling Space CI with config from README: {config}")
273
-
274
- return configure_space_ci(
275
- blocks=ui,
276
- trusted_authors=config.get("trusted_authors"),
277
- private=config.get("private", "auto"),
278
- variables=config.get("variables", "auto"),
279
- secrets=config.get("secrets"),
280
- hardware=config.get("hardware"),
281
- storage=config.get("storage"),
282
- )
283
-
284
- # Create webhooks server (with CI url if in Space and not ephemeral)
285
- webhooks_server = enable_space_ci_and_return_server(ui=demo)
286
-
287
- # Add webhooks
288
- @webhooks_server.add_webhook
289
- def update_leaderboard(payload: WebhookPayload) -> None:
290
- """Redownloads the leaderboard dataset each time it updates"""
291
- if payload.repo.type == "dataset" and payload.event.action == "update":
292
- datasets.load_dataset(
293
- AGGREGATED_REPO,
294
- "default",
295
- split="train",
296
- cache_dir=HF_HOME,
297
- download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
298
- verification_mode="no_checks"
299
- )
300
-
301
- # The below code is not used at the moment, as we can manage the queue file locally
302
- LAST_UPDATE_QUEUE = datetime.datetime.now()
303
- @webhooks_server.add_webhook
304
- def update_queue(payload: WebhookPayload) -> None:
305
- """Redownloads the queue dataset each time it updates"""
306
- if payload.repo.type == "dataset" and payload.event.action == "update":
307
- current_time = datetime.datetime.now()
308
- global LAST_UPDATE_QUEUE
309
- if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
310
- print("Would have updated the queue")
311
- # We only redownload is last update was more than 10 minutes ago, as the queue is
312
- # updated regularly and heavy to download
313
- #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
314
- LAST_UPDATE_QUEUE = datetime.datetime.now()
315
-
316
- webhooks_server.launch()
 
37
  REPO_ID,
38
  HF_HOME,
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  demo = gr.Blocks(css=custom_css)
42
  with demo:
43
  gr.HTML(TITLE)
44
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
45
 
46
+ countdown = gr.HTML(
47
+ """<div align="center">
48
+ <div position: relative>
49
+ <img
50
+ src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
51
+ allowtransparency="true"
52
+ style="display:block;width:100%;height:auto;"
53
+ />
54
+ <iframe
55
+ src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&amp;timezone=Europe%2FParis&amp;width=&amp;style=circles&amp;uid=815898&amp;loc=https://logwork.com/countdown-fxmc&amp;language=en&amp;textcolor=&amp;background=%23ffd21e&amp;date=2024-06-26%2015%3A00%3A00&amp;digitscolor=%23ff9d00&amp;unitscolor=&amp"
56
+ style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
57
+ scrolling="no"
58
+ allowtransparency="true"
59
+ frameborder="0"
60
+ allowfullscreen
61
+ />
62
+ </div>
63
+ </div>"""
64
+ )
65
+ #gif = gr.Image(value="./gif.gif", interactive=False)
66
+ gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with gr.Row():
69
  with gr.Accordion("📙 Citation", open=False):
 
75
  show_copy_button=True,
76
  )
77
 
78
+ demo.queue(default_concurrency_limit=40).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_bkp.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import time
4
+ import datetime
5
+ import gradio as gr
6
+ import datasets
7
+ from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
8
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
9
+
10
+ from src.display.about import (
11
+ CITATION_BUTTON_LABEL,
12
+ CITATION_BUTTON_TEXT,
13
+ EVALUATION_QUEUE_TEXT,
14
+ FAQ_TEXT,
15
+ INTRODUCTION_TEXT,
16
+ LLM_BENCHMARKS_TEXT,
17
+ TITLE,
18
+ )
19
+ from src.display.css_html_js import custom_css
20
+ from src.display.utils import (
21
+ BENCHMARK_COLS,
22
+ COLS,
23
+ EVAL_COLS,
24
+ EVAL_TYPES,
25
+ AutoEvalColumn,
26
+ ModelType,
27
+ Precision,
28
+ WeightType,
29
+ fields,
30
+ )
31
+ from src.envs import (
32
+ API,
33
+ EVAL_REQUESTS_PATH,
34
+ AGGREGATED_REPO,
35
+ HF_TOKEN,
36
+ QUEUE_REPO,
37
+ REPO_ID,
38
+ HF_HOME,
39
+ )
40
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
41
+ from src.submission.submit import add_new_eval
42
+ from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
43
+
44
+ # Configure logging
45
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
46
+
47
+
48
+ # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
49
+ # This controls whether a full initialization should be performed.
50
+ DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
51
+ LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
52
+
53
+ def restart_space():
54
+ API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
55
+
56
+
57
+ def time_diff_wrapper(func):
58
+ def wrapper(*args, **kwargs):
59
+ start_time = time.time()
60
+ result = func(*args, **kwargs)
61
+ end_time = time.time()
62
+ diff = end_time - start_time
63
+ logging.info(f"Time taken for {func.__name__}: {diff} seconds")
64
+ return result
65
+
66
+ return wrapper
67
+
68
+
69
+ @time_diff_wrapper
70
+ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
71
+ """Download dataset with exponential backoff retries."""
72
+ attempt = 0
73
+ while attempt < max_attempts:
74
+ try:
75
+ logging.info(f"Downloading {repo_id} to {local_dir}")
76
+ snapshot_download(
77
+ repo_id=repo_id,
78
+ local_dir=local_dir,
79
+ repo_type=repo_type,
80
+ tqdm_class=None,
81
+ etag_timeout=30,
82
+ max_workers=8,
83
+ )
84
+ logging.info("Download successful")
85
+ return
86
+ except Exception as e:
87
+ wait_time = backoff_factor**attempt
88
+ logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
89
+ time.sleep(wait_time)
90
+ attempt += 1
91
+ raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
92
+
93
+ def get_latest_data_leaderboard(leaderboard_initial_df = None):
94
+ current_time = datetime.datetime.now()
95
+ global LAST_UPDATE_LEADERBOARD
96
+ if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
97
+ return leaderboard_initial_df
98
+ LAST_UPDATE_LEADERBOARD = current_time
99
+ leaderboard_dataset = datasets.load_dataset(
100
+ AGGREGATED_REPO,
101
+ "default",
102
+ split="train",
103
+ cache_dir=HF_HOME,
104
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
105
+ verification_mode="no_checks"
106
+ )
107
+
108
+ leaderboard_df = get_leaderboard_df(
109
+ leaderboard_dataset=leaderboard_dataset,
110
+ cols=COLS,
111
+ benchmark_cols=BENCHMARK_COLS,
112
+ )
113
+
114
+ return leaderboard_df
115
+
116
+ def get_latest_data_queue():
117
+ eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
118
+ return eval_queue_dfs
119
+
120
+ def init_space():
121
+ """Initializes the application space, loading only necessary data."""
122
+ if DO_FULL_INIT:
123
+ # These downloads only occur on full initialization
124
+ try:
125
+ download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
126
+ except Exception:
127
+ restart_space()
128
+
129
+ # Always redownload the leaderboard DataFrame
130
+ leaderboard_df = get_latest_data_leaderboard()
131
+
132
+ # Evaluation queue DataFrame retrieval is independent of initialization detail level
133
+ eval_queue_dfs = get_latest_data_queue()
134
+
135
+ return leaderboard_df, eval_queue_dfs
136
+
137
+
138
+ # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
139
+ # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
140
+ leaderboard_df, eval_queue_dfs = init_space()
141
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
142
+
143
+
144
+ # Data processing for plots now only on demand in the respective Gradio tab
145
+ def load_and_create_plots():
146
+ plot_df = create_plot_df(create_scores_df(leaderboard_df))
147
+ return plot_df
148
+
149
+ def init_leaderboard(dataframe):
150
+ return Leaderboard(
151
+ value = dataframe,
152
+ datatype=[c.type for c in fields(AutoEvalColumn)],
153
+ select_columns=SelectColumns(
154
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
155
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
156
+ label="Select Columns to Display:",
157
+ ),
158
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
159
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
160
+ filter_columns=[
161
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
162
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
163
+ ColumnFilter(
164
+ AutoEvalColumn.params.name,
165
+ type="slider",
166
+ min=0.01,
167
+ max=150,
168
+ label="Select the number of parameters (B)",
169
+ ),
170
+ ColumnFilter(
171
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
172
+ ),
173
+ ColumnFilter(
174
+ AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
175
+ ),
176
+ ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
177
+ ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
178
+ ],
179
+ bool_checkboxgroup_label="Hide models",
180
+ interactive=False,
181
+ )
182
+
183
+ demo = gr.Blocks(css=custom_css)
184
+ with demo:
185
+ gr.HTML(TITLE)
186
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
187
+
188
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
189
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
190
+ leaderboard = init_leaderboard(leaderboard_df)
191
+
192
+ with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
193
+ with gr.Row():
194
+ with gr.Column():
195
+ plot_df = load_and_create_plots()
196
+ chart = create_metric_plot_obj(
197
+ plot_df,
198
+ [AutoEvalColumn.average.name],
199
+ title="Average of Top Scores and Human Baseline Over Time (from last update)",
200
+ )
201
+ gr.Plot(value=chart, min_width=500)
202
+ with gr.Column():
203
+ plot_df = load_and_create_plots()
204
+ chart = create_metric_plot_obj(
205
+ plot_df,
206
+ BENCHMARK_COLS,
207
+ title="Top Scores and Human Baseline Over Time (from last update)",
208
+ )
209
+ gr.Plot(value=chart, min_width=500)
210
+
211
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
212
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
213
+
214
+ with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
215
+ gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
216
+
217
+ with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
218
+ countdown = gr.HTML(
219
+ """<div align="center">
220
+ <div position: relative>
221
+ <img
222
+ src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
223
+ allowtransparency="true"
224
+ style="display:block;width:100%;height:auto;"
225
+ />
226
+ <iframe
227
+ src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&amp;timezone=Europe%2FParis&amp;width=&amp;style=circles&amp;uid=815898&amp;loc=https://logwork.com/countdown-fxmc&amp;language=en&amp;textcolor=&amp;background=%23ffd21e&amp;date=2024-06-26%2015%3A00%3A00&amp;digitscolor=%23ff9d00&amp;unitscolor=&amp"
228
+ style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
229
+ scrolling="no"
230
+ allowtransparency="true"
231
+ frameborder="0"
232
+ allowfullscreen
233
+ />
234
+ </div>
235
+ </div>"""
236
+ )
237
+ #gif = gr.Image(value="./gif.gif", interactive=False)
238
+ gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
239
+
240
+ with gr.Row():
241
+ with gr.Accordion("📙 Citation", open=False):
242
+ citation_button = gr.Textbox(
243
+ value=CITATION_BUTTON_TEXT,
244
+ label=CITATION_BUTTON_LABEL,
245
+ lines=20,
246
+ elem_id="citation-button",
247
+ show_copy_button=True,
248
+ )
249
+
250
+ demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
251
+
252
+
253
+ demo.queue(default_concurrency_limit=40)
254
+
255
+ # Start ephemeral Spaces on PRs (see config in README.md)
256
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
257
+
258
+ def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
259
+ # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
260
+ # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
261
+ # ht to Lucain!
262
+ if SPACE_ID is None:
263
+ print("Not in a Space: Space CI disabled.")
264
+ return WebhooksServer(ui=demo)
265
+
266
+ if IS_EPHEMERAL_SPACE:
267
+ print("In an ephemeral Space: Space CI disabled.")
268
+ return WebhooksServer(ui=demo)
269
+
270
+ card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
271
+ config = card.data.get("space_ci", {})
272
+ print(f"Enabling Space CI with config from README: {config}")
273
+
274
+ return configure_space_ci(
275
+ blocks=ui,
276
+ trusted_authors=config.get("trusted_authors"),
277
+ private=config.get("private", "auto"),
278
+ variables=config.get("variables", "auto"),
279
+ secrets=config.get("secrets"),
280
+ hardware=config.get("hardware"),
281
+ storage=config.get("storage"),
282
+ )
283
+
284
+ # Create webhooks server (with CI url if in Space and not ephemeral)
285
+ webhooks_server = enable_space_ci_and_return_server(ui=demo)
286
+
287
+ # Add webhooks
288
+ @webhooks_server.add_webhook
289
+ def update_leaderboard(payload: WebhookPayload) -> None:
290
+ """Redownloads the leaderboard dataset each time it updates"""
291
+ if payload.repo.type == "dataset" and payload.event.action == "update":
292
+ datasets.load_dataset(
293
+ AGGREGATED_REPO,
294
+ "default",
295
+ split="train",
296
+ cache_dir=HF_HOME,
297
+ download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
298
+ verification_mode="no_checks"
299
+ )
300
+
301
+ # The below code is not used at the moment, as we can manage the queue file locally
302
+ LAST_UPDATE_QUEUE = datetime.datetime.now()
303
+ @webhooks_server.add_webhook
304
+ def update_queue(payload: WebhookPayload) -> None:
305
+ """Redownloads the queue dataset each time it updates"""
306
+ if payload.repo.type == "dataset" and payload.event.action == "update":
307
+ current_time = datetime.datetime.now()
308
+ global LAST_UPDATE_QUEUE
309
+ if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
310
+ print("Would have updated the queue")
311
+ # We only redownload is last update was more than 10 minutes ago, as the queue is
312
+ # updated regularly and heavy to download
313
+ #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
314
+ LAST_UPDATE_QUEUE = datetime.datetime.now()
315
+
316
+ webhooks_server.launch()
src/display/about.py CHANGED
@@ -219,89 +219,6 @@ CITATION_BUTTON_TEXT = r"""
219
  publisher = {Hugging Face},
220
  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
221
  }
222
- @software{eval-harness,
223
- author = {Gao, Leo and
224
- Tow, Jonathan and
225
- Biderman, Stella and
226
- Black, Sid and
227
- DiPofi, Anthony and
228
- Foster, Charles and
229
- Golding, Laurence and
230
- Hsu, Jeffrey and
231
- McDonell, Kyle and
232
- Muennighoff, Niklas and
233
- Phang, Jason and
234
- Reynolds, Laria and
235
- Tang, Eric and
236
- Thite, Anish and
237
- Wang, Ben and
238
- Wang, Kevin and
239
- Zou, Andy},
240
- title = {A framework for few-shot language model evaluation},
241
- month = sep,
242
- year = 2021,
243
- publisher = {Zenodo},
244
- version = {v0.0.1},
245
- doi = {10.5281/zenodo.5371628},
246
- url = {https://doi.org/10.5281/zenodo.5371628}
247
- }
248
- @misc{clark2018think,
249
- title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
250
- author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
251
- year={2018},
252
- eprint={1803.05457},
253
- archivePrefix={arXiv},
254
- primaryClass={cs.AI}
255
- }
256
- @misc{zellers2019hellaswag,
257
- title={HellaSwag: Can a Machine Really Finish Your Sentence?},
258
- author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
259
- year={2019},
260
- eprint={1905.07830},
261
- archivePrefix={arXiv},
262
- primaryClass={cs.CL}
263
- }
264
- @misc{hendrycks2021measuring,
265
- title={Measuring Massive Multitask Language Understanding},
266
- author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
267
- year={2021},
268
- eprint={2009.03300},
269
- archivePrefix={arXiv},
270
- primaryClass={cs.CY}
271
- }
272
- @misc{lin2022truthfulqa,
273
- title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
274
- author={Stephanie Lin and Jacob Hilton and Owain Evans},
275
- year={2022},
276
- eprint={2109.07958},
277
- archivePrefix={arXiv},
278
- primaryClass={cs.CL}
279
- }
280
- @misc{DBLP:journals/corr/abs-1907-10641,
281
- title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
282
- author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
283
- year={2019},
284
- eprint={1907.10641},
285
- archivePrefix={arXiv},
286
- primaryClass={cs.CL}
287
- }
288
- @misc{DBLP:journals/corr/abs-2110-14168,
289
- title={Training Verifiers to Solve Math Word Problems},
290
- author={Karl Cobbe and
291
- Vineet Kosaraju and
292
- Mohammad Bavarian and
293
- Mark Chen and
294
- Heewoo Jun and
295
- Lukasz Kaiser and
296
- Matthias Plappert and
297
- Jerry Tworek and
298
- Jacob Hilton and
299
- Reiichiro Nakano and
300
- Christopher Hesse and
301
- John Schulman},
302
- year={2021},
303
- eprint={2110.14168},
304
- archivePrefix={arXiv},
305
- primaryClass={cs.CL}
306
- }
307
  """
 
219
  publisher = {Hugging Face},
220
  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
221
  }
222
+
223
+ ????
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  """