yzabc007 committed on
Commit
37b3751
·
1 Parent(s): da96aa6

Update space

Browse files
Files changed (5) hide show
  1. app.py +116 -10
  2. src/about.py +17 -0
  3. src/display/utils.py +22 -0
  4. src/leaderboard/read_evals.py +130 -7
  5. src/populate.py +32 -23
app.py CHANGED
@@ -97,8 +97,11 @@ def init_leaderboard(dataframe):
97
  interactive=False,
98
  )
99
 
100
- model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
101
- model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
 
102
 
103
  def overall_leaderboard(dataframe):
104
  if dataframe is None or dataframe.empty:
@@ -118,6 +121,25 @@ def overall_leaderboard(dataframe):
118
 
119
 
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  demo = gr.Blocks(css=custom_css)
122
  with demo:
123
  gr.HTML(TITLE)
@@ -126,33 +148,117 @@ with demo:
126
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
127
 
128
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
129
- # leaderboard = init_leaderboard(LEADERBOARD_DF)
130
- leaderboard = overall_leaderboard(model_leaderboard_df)
131
 
132
 
133
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
134
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
137
 
138
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
139
  with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
140
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
143
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
146
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
 
149
  with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
150
 
151
  with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
152
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
155
- leaderboard = overall_leaderboard(model_leaderboard_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
 
158
  with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=4):
 
97
  interactive=False,
98
  )
99
 
100
+ # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
101
+ # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
102
+ model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
103
+ # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
104
+
105
 
106
  def overall_leaderboard(dataframe):
107
  if dataframe is None or dataframe.empty:
 
121
 
122
 
123
 
124
+ def overview_leaderboard(dataframe):
125
+ if dataframe is None or dataframe.empty:
126
+ raise ValueError("Leaderboard DataFrame is empty or None.")
127
+
128
+ return Leaderboard(
129
+ value=dataframe,
130
+ datatype=[c.type for c in fields(AutoEvalColumn)],
131
+ select_columns=None,
132
+ search_columns=SearchColumns(primary_column=AutoEvalColumn.model.name, secondary_columns=[],
133
+ placeholder="Search by the model name",
134
+ label="Searching"),
135
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
136
+ filter_columns=None,
137
+ interactive=False,
138
+ )
139
+
140
+
141
+
142
+
143
  demo = gr.Blocks(css=custom_css)
144
  with demo:
145
  gr.HTML(TITLE)
 
148
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
149
 
150
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
151
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
152
+ # leaderboard = overview_leaderboard(model_leaderboard_df)
153
 
154
 
155
  with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
156
+
157
+ leaderboard = overall_leaderboard(
158
+ get_model_leaderboard_df(
159
+ model_result_path,
160
+ benchmark_cols=[
161
+ AutoEvalColumn.rank_overall.name,
162
+ AutoEvalColumn.model.name,
163
+ AutoEvalColumn.score_overall.name,
164
+ AutoEvalColumn.sd_overall.name,
165
+ AutoEvalColumn.license.name,
166
+ AutoEvalColumn.organization.name,
167
+ AutoEvalColumn.knowledge_cutoff.name,
168
+ ],
169
+ rank_col=[AutoEvalColumn.rank_overall.name],
170
+ ))
171
 
172
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
173
 
174
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
175
  with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
176
+ leaderboard = overall_leaderboard(
177
+ get_model_leaderboard_df(
178
+ model_result_path,
179
+ benchmark_cols=[
180
+ AutoEvalColumn.rank_math_algebra.name,
181
+ AutoEvalColumn.model.name,
182
+ AutoEvalColumn.score_math_algebra.name,
183
+ AutoEvalColumn.sd_math_algebra.name,
184
+ AutoEvalColumn.license.name,
185
+ AutoEvalColumn.organization.name,
186
+ AutoEvalColumn.knowledge_cutoff.name,
187
+ ],
188
+ rank_col=[AutoEvalColumn.rank_math_algebra.name],
189
+ )
190
+ )
191
 
192
  with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
193
+ leaderboard = overall_leaderboard(
194
+ get_model_leaderboard_df(
195
+ model_result_path,
196
+ benchmark_cols=[
197
+ AutoEvalColumn.rank_math_geometry.name,
198
+ AutoEvalColumn.model.name,
199
+ AutoEvalColumn.score_math_geometry.name,
200
+ AutoEvalColumn.sd_math_geometry.name,
201
+ AutoEvalColumn.license.name,
202
+ AutoEvalColumn.organization.name,
203
+ AutoEvalColumn.knowledge_cutoff.name,
204
+ ],
205
+ rank_col=[AutoEvalColumn.rank_math_geometry.name],
206
+ )
207
+ )
208
 
209
  with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
210
+ leaderboard = overall_leaderboard(
211
+ get_model_leaderboard_df(
212
+ model_result_path,
213
+ benchmark_cols=[
214
+ AutoEvalColumn.rank_math_probability.name,
215
+ AutoEvalColumn.model.name,
216
+ AutoEvalColumn.score_math_probability.name,
217
+ AutoEvalColumn.sd_math_probability.name,
218
+ AutoEvalColumn.license.name,
219
+ AutoEvalColumn.organization.name,
220
+ AutoEvalColumn.knowledge_cutoff.name,
221
+ ],
222
+ rank_col=[AutoEvalColumn.rank_math_probability.name],
223
+ )
224
+ )
225
 
226
 
227
  with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
228
 
229
  with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
230
+ leaderboard = overall_leaderboard(
231
+ get_model_leaderboard_df(
232
+ model_result_path,
233
+ benchmark_cols=[
234
+ AutoEvalColumn.rank_reason_logical.name,
235
+ AutoEvalColumn.model.name,
236
+ AutoEvalColumn.score_reason_logical.name,
237
+ AutoEvalColumn.sd_reason_logical.name,
238
+ AutoEvalColumn.license.name,
239
+ AutoEvalColumn.organization.name,
240
+ AutoEvalColumn.knowledge_cutoff.name,
241
+ ],
242
+ rank_col=[AutoEvalColumn.rank_reason_logical.name],
243
+ )
244
+ )
245
 
246
  with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
247
+ leaderboard = overall_leaderboard(
248
+ get_model_leaderboard_df(
249
+ model_result_path,
250
+ benchmark_cols=[
251
+ AutoEvalColumn.rank_reason_social.name,
252
+ AutoEvalColumn.model.name,
253
+ AutoEvalColumn.score_reason_social.name,
254
+ AutoEvalColumn.sd_reason_social.name,
255
+ AutoEvalColumn.license.name,
256
+ AutoEvalColumn.organization.name,
257
+ AutoEvalColumn.knowledge_cutoff.name,
258
+ ],
259
+ rank_col=[AutoEvalColumn.rank_reason_social.name],
260
+ )
261
+ )
262
 
263
 
264
  with gr.TabItem("</> Coding", elem_id="coding-tab-table", id=4):
src/about.py CHANGED
@@ -1,6 +1,23 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  @dataclass
5
  class Domain:
6
  dimension: str
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
+ # @dataclass
6
+ # class Ranking:
7
+ # dimension: str
8
+ # metric: str
9
+ # col_name: str
10
+
11
+ # class Rankings(Enum):
12
+ # # dimension_key in the json file, metric_key in the json file, name to display in the leaderboard
13
+ # rank0 = Ranking("overall", "Avg Score", "Overall")
14
+ # rank1 = Ranking("math_algebra", "Avg Score", "Math (Algebra)")
15
+ # rank2 = Ranking("math_geometry", "Avg Score", "Math (Geometry)")
16
+ # rank3 = Ranking("math_prob", "Avg Score", "Math (Probability)")
17
+ # rank4 = Ranking("reason_logical", "Avg Score", "Logical Reasoning")
18
+ # rank5 = Ranking("reason_social", "Avg Score", "Social Reasoning")
19
+
20
+
21
  @dataclass
22
  class Domain:
23
  dimension: str
src/display/utils.py CHANGED
@@ -63,6 +63,28 @@ auto_eval_column_dict.append(["score", ColumnContent, field(default_factory=lamb
63
  auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  for task in Tasks:
68
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
 
63
  auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
+ # fine-grained dimensions
67
+ auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Overall", "number", True))])
68
+ auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Algebra)", "number", True))])
69
+ auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Geometry)", "number", True))])
70
+ auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Math (Probability)", "number", True))])
71
+ auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Logical Reasoning", "number", True))])
72
+ auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Social Reasoning", "number", True))])
73
+
74
+ auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("SD Overall", "number", True))])
75
+ auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Algebra)", "number", True))])
76
+ auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Geometry)", "number", True))])
77
+ auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("SD Math (Probability)", "number", True))])
78
+ auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("SD Logical Reasoning", "number", True))])
79
+ auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("SD Social Reasoning", "number", True))])
80
+
81
+ auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Overall", "number", True))])
82
+ auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Algebra)", "number", True))])
83
+ auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Geometry)", "number", True))])
84
+ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Math (Probability)", "number", True))])
85
+ auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Logical Reasoning", "number", True))])
86
+ auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank Social Reasoning", "number", True))])
87
+
88
 
89
  for task in Tasks:
90
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
src/leaderboard/read_evals.py CHANGED
@@ -11,14 +11,10 @@ from src.display.formatting import make_clickable_model
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Domains
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
- # @dataclass
15
- # class RankResult:
16
-
17
-
18
 
19
  @dataclass
20
- class ModelResult:
21
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
22
  """
23
  eval_name: str
24
  full_model: str
@@ -74,7 +70,7 @@ class ModelResult:
74
 
75
  # AutoEvalColumn.precision.name: self.precision.value.name,
76
  # AutoEvalColumn.model_type.name: self.model_type.value.name,
77
- # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
78
  # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
79
  # AutoEvalColumn.architecture.name: self.architecture,
80
  # AutoEvalColumn.revision.name: self.revision,
@@ -83,6 +79,116 @@ class ModelResult:
83
  # AutoEvalColumn.params.name: self.num_params,
84
  # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
85
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # for task in Tasks:
88
  # data_dict[task.value.col_name] = self.results[task.value.benchmark]
@@ -306,7 +412,23 @@ def get_raw_model_results(results_path: str) -> list[EvalResult]:
306
  # full_model='OpenAI/ChatGPT-4o-latest (2024-09-03)',
307
  # org='OpenAI', model='ChatGPT-4o-latest (2024-09-03)',
308
  # results={'overall': None}, license='Proprietary', knowledge_cutoff='2023/10')
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
 
 
 
310
  eval_name = eval_result.eval_name
311
  eval_results[eval_name] = eval_result
312
 
@@ -319,6 +441,7 @@ def get_raw_model_results(results_path: str) -> list[EvalResult]:
319
  results = []
320
  for v in eval_results.values():
321
  # print(v.to_dict())
 
322
  # {'eval_name': 'OpenAI_ChatGPT-4o-latest (2024-09-03)',
323
  # 'Model': '<a target="_blank" href="https://huggingface.co/OpenAI/ChatGPT-4o-latest (2024-09-03)"
324
  # style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">OpenAI/ChatGPT-4o-latest (2024-09-03)</a>',
 
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Domains
12
  from src.submission.check_validity import is_model_on_hub
13
 
 
 
 
 
14
 
15
  @dataclass
16
+ class RankResult:
17
+ """Represents one entry in the overall ranking table
18
  """
19
  eval_name: str
20
  full_model: str
 
70
 
71
  # AutoEvalColumn.precision.name: self.precision.value.name,
72
  # AutoEvalColumn.model_type.name: self.model_type.value.name,
73
+ # AutoEvalColumn.model_type_symbol.name
74
  # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
75
  # AutoEvalColumn.architecture.name: self.architecture,
76
  # AutoEvalColumn.revision.name: self.revision,
 
79
  # AutoEvalColumn.params.name: self.num_params,
80
  # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
81
  }
82
+
83
+
84
+
85
+ @dataclass
86
+ class ModelResult:
87
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
88
+ """
89
+ eval_name: str
90
+ full_model: str
91
+ org: str
92
+ model: str
93
+ results: dict
94
+ license: str = "?"
95
+ knowledge_cutoff: str = ""
96
+
97
+ @classmethod
98
+ def init_from_json_dict(self, data):
99
+
100
+ config = data.get("config")
101
+ # Get model and org
102
+ model = config.get("model_name")
103
+ org = config.get("organization")
104
+ license = config.get("license")
105
+ knowledge_cutoff = config.get("knowledge_cutoff")
106
+
107
+ model_results = data.get("results")
108
+ new_results = {}
109
+ for k, v in model_results.items():
110
+ new_v = {}
111
+ for kk, vv in v.items():
112
+ if vv == 'N/A':
113
+ new_v[kk] = None
114
+ else:
115
+ new_v[kk] = vv
116
+
117
+ new_results[k] = new_v
118
+
119
+ # Extract results available in this file (some results are split in several files)
120
+ # results = {}
121
+ # for domain in Domains:
122
+ # domain = domain.value
123
+ # results[domain.dimension] = model_results.get(domain.dimension).get(domain.metric, None)
124
+
125
+ return self(
126
+ eval_name=f"{org}_{model}",
127
+ full_model=f"{org}/{model}",
128
+ org=org,
129
+ model=model,
130
+ results=new_results,
131
+ license=license,
132
+ knowledge_cutoff=knowledge_cutoff
133
+ )
134
+
135
+ def to_dict(self):
136
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
137
+
138
+ data_dict = {
139
+ # "eval_name": self.eval_name, # not a column, just a save name,
140
+ # AutoEvalColumn.model.name: make_clickable_model(self.full_model),
141
+ # AutoEvalColumn.rank.name: None, # placeholder for the rank
142
+ AutoEvalColumn.model.name: self.model,
143
+ # AutoEvalColumn.score.name: self.results[Domains.dim0.value.dimension],
144
+ # AutoEvalColumn.score_sd.name: None, # placeholder for the score sd
145
+
146
+ # AutoEvalColumn.score_overall.name: float(self.results.get("OVERALL").get("Average Score", None)),
147
+ # AutoEvalColumn.score_math_algebra.name: float(self.results.get("Algebra").get("Average Score", None)),
148
+ # AutoEvalColumn.score_math_geometry.name: float(self.results.get("Geometry").get("Average Score", None)),
149
+ # AutoEvalColumn.score_math_probability.name: float(self.results.get("Probability").get("Average Score", None)),
150
+ # AutoEvalColumn.score_reason_logical.name: float(self.results.get("Logical").get("Average Score", None)),
151
+ # AutoEvalColumn.score_reason_social.name: float(self.results.get("Social").get("Average Score", None)),
152
+
153
+ # AutoEvalColumn.sd_overall.name: float(self.results.get("OVERALL").get("Standard Deviation", None)),
154
+ # AutoEvalColumn.sd_math_algebra.name: float(self.results.get("Algebra").get("Standard Deviation", None)),
155
+ # AutoEvalColumn.sd_math_geometry.name: float(self.results.get("Geometry").get("Standard Deviation", None)),
156
+ # AutoEvalColumn.sd_math_probability.name: float(self.results.get("Probability").get("Standard Deviation", None)),
157
+ # AutoEvalColumn.sd_reason_logical.name: float(self.results.get("Logical").get("Standard Deviation", None)),
158
+ # AutoEvalColumn.sd_reason_social.name: float(self.results.get("Social").get("Standard Deviation", None)),
159
+
160
+ # AutoEvalColumn.rank_overall.name: int(self.results.get("OVERALL").get("Rank", None)),
161
+ # AutoEvalColumn.rank_math_algebra.name: int(self.results.get("Algebra").get("Rank", None)),
162
+ # AutoEvalColumn.rank_math_geometry.name: int(self.results.get("Geometry").get("Rank", None)),
163
+ # AutoEvalColumn.rank_math_probability.name: int(self.results.get("Probability").get("Rank", None)),
164
+ # AutoEvalColumn.rank_reason_logical.name: int(self.results.get("Logical").get("Rank", None)),
165
+ # AutoEvalColumn.rank_reason_social.name: int(self.results.get("Social").get("Rank", None)),
166
+
167
+ AutoEvalColumn.score_overall.name: self.results.get("OVERALL").get("Average Score", None),
168
+ AutoEvalColumn.score_math_algebra.name: self.results.get("Algebra").get("Average Score", None),
169
+ AutoEvalColumn.score_math_geometry.name: self.results.get("Geometry").get("Average Score", None),
170
+ AutoEvalColumn.score_math_probability.name: self.results.get("Probability").get("Average Score", None),
171
+ AutoEvalColumn.score_reason_logical.name: self.results.get("Logical").get("Average Score", None),
172
+ AutoEvalColumn.score_reason_social.name: self.results.get("Social").get("Average Score", None),
173
+
174
+ AutoEvalColumn.sd_overall.name: self.results.get("OVERALL").get("Standard Deviation", None),
175
+ AutoEvalColumn.sd_math_algebra.name: self.results.get("Algebra").get("Standard Deviation", None),
176
+ AutoEvalColumn.sd_math_geometry.name: self.results.get("Geometry").get("Standard Deviation", None),
177
+ AutoEvalColumn.sd_math_probability.name: self.results.get("Probability").get("Standard Deviation", None),
178
+ AutoEvalColumn.sd_reason_logical.name: self.results.get("Logical").get("Standard Deviation", None),
179
+ AutoEvalColumn.sd_reason_social.name: self.results.get("Social").get("Standard Deviation", None),
180
+
181
+ AutoEvalColumn.rank_overall.name: self.results.get("OVERALL").get("Rank", None),
182
+ AutoEvalColumn.rank_math_algebra.name: self.results.get("Algebra").get("Rank", None),
183
+ AutoEvalColumn.rank_math_geometry.name: self.results.get("Geometry").get("Rank", None),
184
+ AutoEvalColumn.rank_math_probability.name: self.results.get("Probability").get("Rank", None),
185
+ AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
186
+ AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),
187
+
188
+ AutoEvalColumn.license.name: self.license,
189
+ AutoEvalColumn.organization.name: self.org,
190
+ AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
191
+ }
192
 
193
  # for task in Tasks:
194
  # data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
412
  # full_model='OpenAI/ChatGPT-4o-latest (2024-09-03)',
413
  # org='OpenAI', model='ChatGPT-4o-latest (2024-09-03)',
414
  # results={'overall': None}, license='Proprietary', knowledge_cutoff='2023/10')
415
+
416
+ # all_num_results = eval_result.results
417
+
418
+ # def get_terminal_values(data):
419
+ # terminal_values = []
420
+ # for key, value in data.items():
421
+ # if isinstance(value, dict):
422
+ # terminal_values.extend(get_terminal_values(value))
423
+ # else:
424
+ # terminal_values.append(value)
425
+ # return terminal_values
426
+
427
+ # all_values = get_terminal_values(all_num_results)
428
 
429
+ # if 'N/A' in all_values:
430
+ # continue
431
+
432
  eval_name = eval_result.eval_name
433
  eval_results[eval_name] = eval_result
434
 
 
441
  results = []
442
  for v in eval_results.values():
443
  # print(v.to_dict())
444
+ # exit()
445
  # {'eval_name': 'OpenAI_ChatGPT-4o-latest (2024-09-03)',
446
  # 'Model': '<a target="_blank" href="https://huggingface.co/OpenAI/ChatGPT-4o-latest (2024-09-03)"
447
  # style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">OpenAI/ChatGPT-4o-latest (2024-09-03)</a>',
src/populate.py CHANGED
@@ -9,44 +9,53 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_resul
9
 
10
 
11
 
12
- def get_overview_leaderboard_df(results_path: str) -> pd.DataFrame:
13
- """Creates a dataframe from all the individual experiment results"""
14
- raw_data = get_raw_eval_results(results_path, requests_path)
15
- all_data_json = [v.to_dict() for v in raw_data]
16
 
17
- df = pd.DataFrame.from_records(all_data_json)
18
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
19
- for col in cols:
20
- if col not in df.columns:
21
- df[col] = None
22
- else:
23
- df[col] = df[col].round(decimals=2)
24
 
25
- # filter out if any of the benchmarks have not been produced
26
- df = df[has_no_nan_values(df, benchmark_cols)]
27
- return df
28
 
29
 
30
 
31
- def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[]) -> pd.DataFrame:
32
  """Creates a dataframe from all the individual experiment results"""
33
  raw_data = get_raw_model_results(results_path)
34
  all_data_json = [v.to_dict() for v in raw_data]
35
 
36
  df = pd.DataFrame.from_records(all_data_json)
37
- df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
38
- df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
 
 
 
 
 
 
 
39
  # print(cols) # []
40
  # print(df.columns) # ['eval_name', 'Model', 'Hub License', 'Organization', 'Knowledge cutoff', 'Overall']
41
  # exit()
42
- for col in cols:
43
- if col not in df.columns:
44
- df[col] = None
45
- else:
46
- df = df[cols].round(decimals=2)
 
 
47
 
48
  # filter out if any of the benchmarks have not been produced
49
- # df = df[has_no_nan_values(df, benchmark_cols)]
50
  return df
51
 
52
 
 
9
 
10
 
11
 
12
+ # def get_overview_leaderboard_df(results_path: str) -> pd.DataFrame:
13
+ # """Creates a dataframe from all the individual experiment results"""
14
+ # raw_data = get_raw_eval_results(results_path, requests_path)
15
+ # all_data_json = [v.to_dict() for v in raw_data]
16
 
17
+ # df = pd.DataFrame.from_records(all_data_json)
18
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
19
+ # for col in cols:
20
+ # if col not in df.columns:
21
+ # df[col] = None
22
+ # else:
23
+ # df[col] = df[col].round(decimals=2)
24
 
25
+ # # filter out if any of the benchmarks have not been produced
26
+ # df = df[has_no_nan_values(df, benchmark_cols)]
27
+ # return df
28
 
29
 
30
 
31
+ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[], rank_col: list=[]) -> pd.DataFrame:
32
  """Creates a dataframe from all the individual experiment results"""
33
  raw_data = get_raw_model_results(results_path)
34
  all_data_json = [v.to_dict() for v in raw_data]
35
 
36
  df = pd.DataFrame.from_records(all_data_json)
37
+
38
+ df = df[benchmark_cols]
39
+ df = df.dropna(subset=benchmark_cols)
40
+
41
+ if rank_col:
42
+ df = df.sort_values(by=[rank_col[0]], ascending=True)
43
+
44
+ # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
45
+ # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
46
  # print(cols) # []
47
  # print(df.columns) # ['eval_name', 'Model', 'Hub License', 'Organization', 'Knowledge cutoff', 'Overall']
48
  # exit()
49
+ # NOTE: filtering by the cols list is disabled below; columns are selected via benchmark_cols above
50
+
51
+ # for col in cols:
52
+ # if col not in df.columns:
53
+ # df[col] = None
54
+ # else:
55
+ # df = df[cols].round(decimals=2)
56
 
57
  # filter out if any of the benchmarks have not been produced
58
+ df = df[has_no_nan_values(df, benchmark_cols)]
59
  return df
60
 
61