Clémentine committed on
Commit
b1a1395
1 Parent(s): ccefec9

Refactor 2 - added plotting back


The plots only take the latest submissions into account; there is no way to recover the original eval dates short of loading that info from the git commits of the results files.
Also sped up the UI with Gradio's concurrency limit parameter.
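
For reference, Gradio 4 event listeners such as .change() and .submit() accept a concurrency_limit argument, and passing None lifts the per-event cap so several table refreshes can run in parallel. A minimal sketch of the pattern applied in app.py below, with illustrative component names rather than the app's real ones:

import gradio as gr

def update_table(query: str) -> str:
    # Stand-in for the leaderboard filtering logic.
    return f"filtered by: {query}"

with gr.Blocks() as demo:
    search_bar = gr.Textbox(label="Search")
    table = gr.Textbox(label="Table")
    # concurrency_limit=None removes the per-event concurrency cap,
    # so many simultaneous users can refresh the table at once.
    search_bar.submit(update_table, [search_bar], table, concurrency_limit=None)

demo.launch()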

app.py CHANGED
@@ -31,18 +31,15 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import (
-    HUMAN_BASELINES,
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
-    join_model_info_with_results,
 )
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
-
 try:
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
@@ -57,13 +54,11 @@ except Exception:
     restart_space()
 
 
-original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
-# models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
-# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-# to_be_dumped = f"models = {repr(models)}\n"
+plot_df = create_plot_df(create_scores_df(raw_data))
 
 (
     finished_eval_queue_df,
@@ -72,16 +67,6 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-# Basics
-#def change_tab(query_param: str):
-#    query_param = query_param.replace("'", '"')
-#    query_param = json.loads(query_param)
-#    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
-#        return gr.Tabs.update(selected=1)
-#    else:
-#        return gr.Tabs.update(selected=0)
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -247,6 +232,7 @@ with demo:
                     search_bar,
                 ],
                 leaderboard_table,
+                concurrency_limit=None,
             )
             shown_columns.change(
                 update_table,
@@ -261,6 +247,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_type.change(
                 update_table,
@@ -275,6 +262,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
            )
            filter_columns_precision.change(
                update_table,
@@ -289,6 +277,7 @@ with demo:
                ],
                leaderboard_table,
                queue=True,
+                concurrency_limit=None,
            )
            filter_columns_size.change(
                update_table,
@@ -303,6 +292,7 @@ with demo:
                ],
                leaderboard_table,
                queue=True,
+                concurrency_limit=None,
            )
            deleted_models_visibility.change(
                update_table,
@@ -317,27 +307,25 @@ with demo:
                ],
                leaderboard_table,
                queue=True,
+                concurrency_limit=None,
            )
 
-        # with gr.TabItem("📈
-        # evolution through time", elem_id="llm-benchmark-tab-table", id=4):
-        #     with gr.Row():
-        #         with gr.Column():
-        #             chart = create_metric_plot_obj(
-        #                 plot_df,
-        #                 ["Average ⬆️"],
-        #                 HUMAN_BASELINES,
-        #                 title="Average of Top Scores and Human Baseline Over Time",
-        #             )
-        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
-        #         with gr.Column():
-        #             chart = create_metric_plot_obj(
-        #                 plot_df,
-        #                 ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
-        #                 HUMAN_BASELINES,
-        #                 title="Top Scores and Human Baseline Over Time",
-        #             )
-        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -440,14 +428,6 @@ with demo:
                 show_copy_button=True,
             )
 
-    #dummy = gr.Textbox(visible=False)
-    #demo.load(
-    #    change_tab,
-    #    dummy,
-    #    tabs,
-    #    js=get_window_url_params,
-    #)
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
src/display/formatting.py CHANGED
@@ -1,6 +1,9 @@
 import os
+from datetime import datetime, timezone
 
 from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
+
 
 API = HfApi()
 
src/display/utils.py CHANGED
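
One note on the human_baseline_row added in the hunk below: the 92.75 average is simply the mean of the seven per-task human baselines. A quick check (a sketch, not part of the commit):

# ARC, HellaSwag, MMLU, TruthfulQA, Winogrande, GSM8K, DROP
baselines = [80.0, 95.0, 89.8, 94.0, 94.0, 100.0, 96.42]
print(round(sum(baselines) / len(baselines), 2))  # 92.75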
@@ -60,7 +60,7 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 25.0,
+    AutoEvalColumn.average.name: 31.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
@@ -72,19 +72,43 @@ baseline_row = {
     AutoEvalColumn.model_type.name: "",
 }
 
+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
+# Drop: https://leaderboard.allenai.org/drop/submissions/public
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.drop.name: 96.42,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+}
 
 @dataclass
-class ModelInfo:
+class ModelTypeDetails:
     name: str
     symbol: str  # emoji
 
 
 class ModelType(Enum):
-    PT = ModelInfo(name="pretrained", symbol="🟢")
-    FT = ModelInfo(name="fine-tuned", symbol="🔶")
-    IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
-    RL = ModelInfo(name="RL-tuned", symbol="🟦")
-    Unknown = ModelInfo(name="", symbol="?")
+    PT = ModelTypeDetails(name="pretrained", symbol="🟢")
+    FT = ModelTypeDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelTypeDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelTypeDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelTypeDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -128,7 +152,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.col_name in fields(AutoEvalColumn)]
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -3,9 +3,9 @@ import json
 import math
 import os
 from dataclasses import dataclass
-from typing import Dict, List, Tuple
 
 import dateutil
+from datetime import datetime
 import numpy as np
 
 from src.display.formatting import make_clickable_model
@@ -61,8 +61,6 @@ class EvalResult:
         still_on_hub, error = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True
         )
-        if not still_on_hub:
-            print(full_model, error)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -100,7 +98,6 @@ class EvalResult:
             results=results,
             precision=precision,  # todo model_type=, weight_type=
             revision=config.get("model_sha", ""),
-            date=config.get("submission_date", ""),
             still_on_hub=still_on_hub,
         )
 
@@ -114,6 +111,7 @@ class EvalResult:
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")
 
@@ -162,7 +160,7 @@ def get_request_file_for_model(model_name, precision):
     return request_file
 
 
-def get_eval_results(results_path: str) -> List[EvalResult]:
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     json_filepaths = []
 
     for root, _, files in os.walk(results_path):
@@ -196,7 +194,8 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
     results = []
     for v in eval_results.values():
         try:
-            results.append(v.to_dict())
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
         except KeyError:  # not all eval values present
             continue
 
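The rewritten loop above only calls to_dict() to surface a KeyError for incomplete entries and stores the EvalResult object itself, so fields like the submission date stay available for plotting. A self-contained sketch of that validate-then-keep pattern, using a hypothetical FakeResult class rather than the repo's EvalResult:

from dataclasses import dataclass

@dataclass
class FakeResult:  # hypothetical stand-in for EvalResult
    results: dict

    def to_dict(self):
        # Mimics EvalResult.to_dict(): raises KeyError if a benchmark is missing.
        return {"arc": self.results["arc"], "mmlu": self.results["mmlu"]}

candidates = [FakeResult({"arc": 50.0, "mmlu": 40.0}), FakeResult({"arc": 50.0})]
kept = []
for v in candidates:
    try:
        v.to_dict()     # completeness check only
        kept.append(v)  # keep the raw object, not the dict
    except KeyError:    # not all eval values present
        continue
print(len(kept))  # 1
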
src/populate.py CHANGED
@@ -6,21 +6,22 @@ import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
-from src.leaderboard.read_evals import get_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    all_data = get_eval_results(results_path)
-    all_data.append(baseline_row)
-    filter_models(all_data)
+    raw_data = get_raw_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json.append(baseline_row)
+    filter_models(all_data_json)
 
-    df = pd.DataFrame.from_records(all_data)
+    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return raw_data, df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
src/submission/check_validity.py CHANGED
@@ -55,7 +55,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
+    except (AttributeError, TypeError ):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
src/tools/plots.py CHANGED
@@ -1,153 +1,84 @@
-import pickle
-from datetime import datetime, timezone
-from typing import Any, Dict, List, Tuple
-
 import pandas as pd
+import numpy as np
 import plotly.express as px
 from plotly.graph_objs import Figure
 
 from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult
 
-# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
-# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
-# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
-# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
-# TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
-# Define the human baselines
-HUMAN_BASELINES = {
-    "Average ⬆️": 0.897 * 100,
-    "ARC": 0.80 * 100,
-    "HellaSwag": 0.95 * 100,
-    "MMLU": 0.898 * 100,
-    "TruthfulQA": 0.94 * 100,
-}
-
-
-def to_datetime(model_info: Tuple[str, Any]) -> datetime:
-    """
-    Converts the lastModified attribute of the object to datetime.
-
-    :param model_info: A tuple containing the name and object.
-        The object must have a lastModified attribute
-        with a string representing the date and time.
-    :return: A datetime object converted from the lastModified attribute of the input object.
-    """
-    name, obj = model_info
-    return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
 
 
-def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Integrates model information with the results DataFrame by matching 'Model sha'.
-    :param results_df: A DataFrame containing results information including 'Model sha' column.
-    :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
+def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     """
-    # copy dataframe to avoid modifying the original
-    df = results_df.copy(deep=True)
-
-    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
-    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
-
-    # load cache from disk
-    try:
-        with open("model_info_cache.pkl", "rb") as f:
-            model_info_cache = pickle.load(f)
-    except (EOFError, FileNotFoundError):
-        model_info_cache = {}
-
-    # Sort date strings using datetime objects as keys
-    sorted_dates = sorted(list(model_info_cache.items()), key=to_datetime, reverse=True)
-    df["Results Date"] = datetime.now().replace(tzinfo=timezone.utc)
-
-    # Define the date format string
-    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
-
-    # Iterate over sorted_dates and update the dataframe
-    for name, obj in sorted_dates:
-        # Convert the lastModified string to a datetime object
-        last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
-
-        # Update the "Results Date" column where "Model sha" equals obj.sha
-        df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
-    return df
+    Generates a DataFrame containing the maximum scores until each date.
 
-
-def create_scores_df(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Generates a DataFrame containing the maximum scores until each result date.
-
-    :param results_df: A DataFrame containing result information including metric scores and result dates.
-    :return: A new DataFrame containing the maximum scores until each result date for every metric.
+    :param results_df: A DataFrame containing result information including metric scores and dates.
+    :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
-    # Step 1: Ensure 'Results Date' is in datetime format and sort the DataFrame by it
-    results_df["Results Date"] = pd.to_datetime(results_df["Results Date"])
-    results_df.sort_values(by="Results Date", inplace=True)
+    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df = pd.DataFrame(raw_data)
+    #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df.sort_values(by="date", inplace=True)
 
     # Step 2: Initialize the scores dictionary
-    scores = {
-        "Average ⬆️": [],
-        "ARC": [],
-        "HellaSwag": [],
-        "MMLU": [],
-        "TruthfulQA": [],
-        "Result Date": [],
-        "Model Name": [],
-    }
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
 
     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for i, row in results_df.iterrows():
-        date = row["Results Date"]
-        for column in scores.keys():
-            if column == "Result Date":
-                if not scores[column] or scores[column][-1] <= date:
-                    scores[column].append(date)
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
                 continue
-            if column == "Model Name":
-                scores[column].append(row["model_name_for_query"])
-                continue
-            current_max = scores[column][-1] if scores[column] else float("-inf")
-            scores[column].append(max(current_max, row[column]))
 
-    # Step 4: Convert the dictionary to a DataFrame
-    return pd.DataFrame(scores)
+            current_date = row["date"]
+            if task.benchmark == "Average":
+                current_score = np.mean(list(row["results"].values()))
+            else:
+                current_score = row["results"][task.benchmark]
+
+            if current_score > current_max:
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
+                else:
+                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
+                current_max = current_score
+                last_date = current_date
 
+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}
 
-def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
+
+def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     Transforms the scores DataFrame into a new format suitable for plotting.
 
-    :param scores_df: A DataFrame containing metric scores and result dates.
+    :param scores_df: A DataFrame containing metric scores and dates.
     :return: A new DataFrame reshaped for plotting purposes.
     """
-    # Sample columns
-    cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
-
     # Initialize the list to store DataFrames
     dfs = []
 
     # Iterate over the cols and create a new DataFrame for each column
-    for col in cols:
-        d = scores_df[[col, "Model Name", "Result Date"]].copy().reset_index(drop=True)
-        d["Metric Name"] = col
-        d.rename(columns={col: "Metric Value"}, inplace=True)
+    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
+        d = scores_df[col].reset_index(drop=True)
+        d["task"] = col
         dfs.append(d)
 
     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)
 
-    # Sort values by 'Result Date'
-    concat_df.sort_values(by="Result Date", inplace=True)
-    concat_df.reset_index(drop=True, inplace=True)
-
-    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
-    concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
-
+    # Sort values by 'date'
+    concat_df.sort_values(by="date", inplace=True)
     concat_df.reset_index(drop=True, inplace=True)
     return concat_df
 
 
 def create_metric_plot_obj(
-    df: pd.DataFrame, metrics: List[str], human_baselines: Dict[str, float], title: str
+    df: pd.DataFrame, metrics: list[str], title: str
 ) -> Figure:
     """
     Create a Plotly figure object with lines representing different metrics
@@ -156,27 +87,25 @@ def create_metric_plot_obj(
     :param df: The DataFrame containing the metric values, names, and dates.
     :param metrics: A list of strings representing the names of the metrics
         to be included in the plot.
-    :param human_baselines: A dictionary where keys are metric names
-        and values are human baseline values for the metrics.
     :param title: A string representing the title of the plot.
     :return: A Plotly figure object with lines representing metrics and
         horizontal dotted lines representing human baselines.
     """
 
     # Filter the DataFrame based on the specified metrics
-    df = df[df["Metric Name"].isin(metrics)]
+    df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in human_baselines.items() if k in metrics}
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
-        x="Result Date",
-        y="Metric Value",
-        color="Metric Name",
+        x="date",
+        y="score",
+        color="task",
         markers=True,
-        custom_data=["Metric Name", "Metric Value", "Model Name"],
+        custom_data=["task", "score", "model"],
         title=title,
     )
 
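
To make the new plotting pipeline concrete: create_scores_df keeps, per task, only the submissions that set a new best score; create_plot_df stacks those per-task frames and tags each with its task name; create_metric_plot_obj then draws one line per task. A self-contained sketch with toy data and hypothetical model names, not the leaderboard's real results:

import pandas as pd
import plotly.express as px

# Toy submissions for a single benchmark: (model, date, score).
rows = [
    {"model": "model-a", "date": "2023-07-01", "score": 45.0},
    {"model": "model-b", "date": "2023-08-15", "score": 52.3},
    {"model": "model-c", "date": "2023-09-01", "score": 49.9},  # not a new best, dropped
    {"model": "model-d", "date": "2023-10-20", "score": 61.2},
]

# Keep only the submissions that improve on the running maximum,
# mirroring what create_scores_df does for each task.
best, current_max = [], 0.0
for row in sorted(rows, key=lambda r: r["date"]):
    if row["score"] > current_max:
        best.append(row)
        current_max = row["score"]

df = pd.DataFrame(best)
df["task"] = "ToyBenchmark"  # create_plot_df tags each per-task frame like this

# create_metric_plot_obj then plots one line per task.
fig = px.line(df, x="date", y="score", color="task", markers=True, title="Top scores over time (toy data)")
fig.show()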