saattrupdan committed on
Commit
3baf99a
1 Parent(s): 8b5abf6

feat: Update with new results every 30 mins

Browse files
Files changed (1) hide show
  1. app.py +81 -42
app.py CHANGED
@@ -10,6 +10,15 @@ from pydantic import BaseModel
10
  import gradio as gr
11
  import requests
12
  import random
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  class Task(BaseModel):
@@ -130,46 +139,9 @@ DATASETS = [
130
  def main() -> None:
131
  """Produce a radial plot."""
132
 
133
- # Download all the newest records
134
- response = requests.get("https://scandeval.com/scandeval_benchmark_results.jsonl")
135
- response.raise_for_status()
136
- records = [
137
- json.loads(dct_str)
138
- for dct_str in response.text.split("\n")
139
- if dct_str.strip("\n")
140
- ]
141
-
142
- # Build a dictionary of languages -> results-dataframes, whose indices are the
143
- # models and columns are the tasks.
144
- results_dfs = dict()
145
- for language in {dataset.language for dataset in DATASETS}:
146
- possible_dataset_names = {
147
- dataset.name for dataset in DATASETS if dataset.language == language
148
- }
149
- data_dict = defaultdict(dict)
150
- for record in records:
151
- model_name = record["model"]
152
- dataset_name = record["dataset"]
153
- if dataset_name in possible_dataset_names:
154
- dataset = next(
155
- dataset for dataset in DATASETS if dataset.name == dataset_name
156
- )
157
- results_dict = record['results']['total']
158
- score = results_dict.get(
159
- f"test_{dataset.task.metric}", results_dict.get(dataset.task.metric)
160
- )
161
- if dataset.task in data_dict[model_name]:
162
- data_dict[model_name][dataset.task].append(score)
163
- else:
164
- data_dict[model_name][dataset.task] = [score]
165
- results_df = pd.DataFrame(data_dict).T.map(
166
- lambda list_or_nan:
167
- np.mean(list_or_nan) if list_or_nan == list_or_nan else list_or_nan
168
- ).dropna()
169
- if any(task not in results_df.columns for task in ALL_TASKS):
170
- results_dfs[language] = pd.DataFrame()
171
- else:
172
- results_dfs[language] = results_df
173
 
174
  all_languages: list[str | int | float | tuple[str, str | int | float]] | None = [
175
  language.name for language in ALL_LANGUAGES.values()
@@ -251,7 +223,6 @@ def main() -> None:
251
  outputs=plot,
252
  )
253
 
254
-
255
  demo.launch()
256
 
257
 
@@ -272,6 +243,8 @@ def update_model_ids_dropdown(
272
  if results_dfs is None or len(language_names) == 0:
273
  return gr.update(choices=[], value=[])
274
 
 
 
275
  filtered_results_dfs = {
276
  language: df
277
  for language, df in results_dfs.items()
@@ -300,7 +273,7 @@ def produce_radial_plot(
300
  model_ids: list[str],
301
  language_names: list[str],
302
  use_win_ratio: bool,
303
- results_dfs: dict[Language, pd.DataFrame] | None
304
  ) -> go.Figure:
305
  """Produce a radial plot as a plotly figure.
306
 
@@ -320,6 +293,17 @@ def produce_radial_plot(
320
  if results_dfs is None or len(language_names) == 0 or len(model_ids) == 0:
321
  return go.Figure()
322
 
 
 
 
 
 
 
 
 
 
 
 
323
  tasks = ALL_TASKS
324
  languages = [ALL_LANGUAGES[language_name] for language_name in language_names]
325
 
@@ -386,7 +370,62 @@ def produce_radial_plot(
386
  polar=dict(radialaxis=dict(visible=True)), showlegend=True, title=title
387
  )
388
 
 
 
389
  return fig
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  if __name__ == "__main__":
392
  main()
 
10
  import gradio as gr
11
  import requests
12
  import random
13
+ import logging
14
+ import datetime as dt
15
+
16
+
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger("radial_plot_generator")
19
+
20
+
21
+ UPDATE_FREQUENCY_MINUTES = 30
22
 
23
 
24
  class Task(BaseModel):
 
139
  def main() -> None:
140
  """Produce a radial plot."""
141
 
142
+ global last_fetch
143
+ results_dfs = fetch_results()
144
+ last_fetch = dt.datetime.now()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  all_languages: list[str | int | float | tuple[str, str | int | float]] | None = [
147
  language.name for language in ALL_LANGUAGES.values()
 
223
  outputs=plot,
224
  )
225
 
 
226
  demo.launch()
227
 
228
 
 
243
  if results_dfs is None or len(language_names) == 0:
244
  return gr.update(choices=[], value=[])
245
 
246
+ # NOTE: The newest records are re-downloaded in `produce_radial_plot` when more than UPDATE_FREQUENCY_MINUTES minutes have passed since the last fetch.
247
+
248
  filtered_results_dfs = {
249
  language: df
250
  for language, df in results_dfs.items()
 
273
  model_ids: list[str],
274
  language_names: list[str],
275
  use_win_ratio: bool,
276
+ results_dfs: dict[Language, pd.DataFrame] | None,
277
  ) -> go.Figure:
278
  """Produce a radial plot as a plotly figure.
279
 
 
293
  if results_dfs is None or len(language_names) == 0 or len(model_ids) == 0:
294
  return go.Figure()
295
 
296
+ global last_fetch
297
+ minutes_since_last_fetch = (dt.datetime.now() - last_fetch).total_seconds() / 60
298
+ if minutes_since_last_fetch > UPDATE_FREQUENCY_MINUTES:
299
+ results_dfs = fetch_results()
300
+ last_fetch = dt.datetime.now()
301
+
302
+ logger.info(
303
+ f"Producing radial plot for models {model_ids!r} on languages "
304
+ f"{language_names!r}..."
305
+ )
306
+
307
  tasks = ALL_TASKS
308
  languages = [ALL_LANGUAGES[language_name] for language_name in language_names]
309
 
 
370
  polar=dict(radialaxis=dict(visible=True)), showlegend=True, title=title
371
  )
372
 
373
+ logger.info("Successfully produced radial plot.")
374
+
375
  return fig
376
 
377
def fetch_results() -> dict[Language, pd.DataFrame]:
    """Fetch the results from the ScandEval benchmark.

    Downloads the JSONL results file and, for every language occurring in
    `DATASETS`, builds a dataframe holding the mean score of each model on each
    task.

    Returns:
        A dictionary of languages -> results-dataframes, whose indices are the
        models and columns are the tasks. Models with a missing score in any
        column are dropped, and a language is mapped to an empty dataframe if
        any task in `ALL_TASKS` is absent from its columns.

    Raises:
        requests.RequestException:
            If the download fails, times out or returns an HTTP error status.
    """
    logger.info("Fetching results from ScandEval benchmark...")

    # Always pass a timeout: without one, `requests` may block indefinitely on an
    # unresponsive server, which would freeze the callback that triggers the fetch.
    response = requests.get(
        "https://scandeval.com/scandeval_benchmark_results.jsonl", timeout=30
    )
    response.raise_for_status()

    # Split on "\n" only (not `splitlines`), since JSON strings may legally contain
    # other Unicode line separators. Whitespace-only lines (e.g. a stray "\r") are
    # skipped instead of being handed to `json.loads`.
    records = [
        json.loads(line) for line in response.text.split("\n") if line.strip()
    ]

    # Name -> dataset lookup built once, replacing a linear scan per record. The
    # first dataset wins on duplicated names, matching a `next(...)` search.
    datasets_by_name: dict = {}
    for dataset in DATASETS:
        datasets_by_name.setdefault(dataset.name, dataset)

    # Build a dictionary of languages -> results-dataframes, whose indices are the
    # models and columns are the tasks.
    results_dfs: dict = dict()
    for language in {dataset.language for dataset in DATASETS}:
        possible_dataset_names = {
            dataset.name for dataset in DATASETS if dataset.language == language
        }

        # Maps model name -> task -> list of scores on the datasets of that task
        data_dict = defaultdict(dict)
        for record in records:
            model_name = record["model"]
            dataset_name = record["dataset"]
            if dataset_name not in possible_dataset_names:
                continue

            dataset = datasets_by_name[dataset_name]
            results_dict = record["results"]["total"]

            # Prefer the "test_"-prefixed metric key, and fall back to the bare one
            score = results_dict.get(
                f"test_{dataset.task.metric}", results_dict.get(dataset.task.metric)
            )

            # A record without the metric would put `None` in the score list and
            # make the mean computation below raise, so skip it instead.
            if score is None:
                logger.warning(
                    "Skipping record for model %r on dataset %r: metric %r not found.",
                    model_name,
                    dataset_name,
                    dataset.task.metric,
                )
                continue

            data_dict[model_name].setdefault(dataset.task, []).append(score)

        # Cells are either a list of scores or NaN (when the model has no score for
        # the task); average the lists and drop models with any missing task.
        results_df = (
            pd.DataFrame(data_dict)
            .T.map(
                lambda list_or_nan: (
                    np.mean(list_or_nan)
                    if isinstance(list_or_nan, list)
                    else list_or_nan
                )
            )
            .dropna()
        )

        if any(task not in results_df.columns for task in ALL_TASKS):
            results_dfs[language] = pd.DataFrame()
        else:
            results_dfs[language] = results_df

    logger.info("Successfully fetched results from ScandEval benchmark.")

    return results_dfs
429
+
430
  if __name__ == "__main__":
431
  main()