Terry Zhuo committed
Commit ae7a86d
Parent: 1e748fb
Files changed (2):
  1. app.py +27 -8
  2. src/utils.py +1 -2
app.py CHANGED
@@ -109,7 +109,7 @@ def select_columns(df, columns):
     return filtered_df
 
 
-def filter_items(df, leaderboard_table, query):
+def filter_types(df, leaderboard_table, query):
     if query == "all":
         return df[leaderboard_table.columns]
     else:
@@ -118,6 +118,16 @@ def filter_items(df, leaderboard_table, query):
         return filtered_df[leaderboard_table.columns]
 
 
+def filter_direct_complete(df, leaderboard_table, query):
+    if query == "all":
+        return df[leaderboard_table.columns]
+
+    if query == "chat template":
+        return df[~df["direct_complete"]][leaderboard_table.columns]
+    else:
+        return df[df["direct_complete"]][leaderboard_table.columns]
+
+
 def search_table(df, leaderboard_table, query):
     filtered_df = df[(df["model"].str.contains(query, case=False))]
     return filtered_df[leaderboard_table.columns]
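As a sanity check on the new filter, here is a minimal, self-contained sketch of how filter_direct_complete behaves, assuming a toy DataFrame standing in for the hidden leaderboard table (model names and scores below are made up for illustration; only the `direct_complete` boolean column is taken from the diff):

import pandas as pd

# Toy stand-ins: `df` mimics the full hidden table, `leaderboard_table`
# mimics the displayed view (a subset of its columns).
df = pd.DataFrame({
    "model": ["m1", "m2", "m3"],
    "complete": [50.1, 42.3, 38.7],
    "direct_complete": [False, True, False],
})
leaderboard_table = df[["model", "complete"]]

def filter_direct_complete(df, leaderboard_table, query):
    if query == "all":
        return df[leaderboard_table.columns]
    if query == "chat template":
        return df[~df["direct_complete"]][leaderboard_table.columns]
    else:
        return df[df["direct_complete"]][leaderboard_table.columns]

# Rows with direct_complete == False are the ones scored via a chat template.
print(filter_direct_complete(df, leaderboard_table, "chat template"))   # keeps m1, m3
print(filter_direct_complete(df, leaderboard_table, "direct complete")) # keeps m2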
@@ -174,13 +184,18 @@ with demo:
                 show_label=False,
                 elem_id="search-bar",
             )
-            filter_columns = gr.Radio(
+            filter_types_columns = gr.Radio(
                 label="⏚ Filter model types",
-                choices=["all", "🟢 base", "🔶 instruction-tuned", "EXT external-evaluation"],
+                choices=["all", "🟢 base", "🔶 instruction-tuned"], #, "EXT external-evaluation"],
                 value="all",
                 elem_id="filter-columns",
             )
-
+            filter_prompting_columns = gr.Radio(
+                label="⏚ Filter prompting",
+                choices=["all", "chat template", "direct complete"],
+                value="all",
+                elem_id="filter-direct-complete",
+            )
             leaderboard_df = gr.components.Dataframe(
                 value=df[
                     [
@@ -210,9 +225,14 @@ with demo:
         [hidden_leaderboard_df, leaderboard_df, search_bar],
         leaderboard_df,
     )
-    filter_columns.change(
-        filter_items,
-        [hidden_leaderboard_df, leaderboard_df, filter_columns],
+    filter_types_columns.change(
+        filter_types,
+        [hidden_leaderboard_df, leaderboard_df, filter_types_columns],
+        leaderboard_df,
+    )
+    filter_prompting_columns.change(
+        filter_direct_complete,
+        [hidden_leaderboard_df, leaderboard_df, filter_prompting_columns],
         leaderboard_df,
     )
     shown_columns.change(
@@ -229,7 +249,6 @@ with demo:
     - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
     - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is bootstrapped 500 times.
     - `size` is the amount of activated model weight during inference.
-    - Some instruction-tuned models are marked with 🟢 symbol, as they miss the chat templates in their tokenizer configurations.
     - Model providers have the responsibility to avoid data contamination. Models trained on closed data can be affected by contamination.
     - For more details, check the 📝 About section.
     - Models with a 🔴 symbol represent external evaluation submissions; this means that we didn't verify the results. You can find the author's submission under the `Submission PR` field in the `See All Columns` tab.
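For context, the event-wiring pattern these hunks extend can be reproduced standalone. A minimal sketch, assuming standard Gradio Blocks semantics (component and variable names here are illustrative, not the app's):

import gradio as gr
import pandas as pd

# Hypothetical miniature leaderboard; the real values come from the app's data.
full_df = pd.DataFrame({"model": ["m1", "m2"], "direct_complete": [True, False]})

def on_prompting_filter(df, query):
    # Mirrors filter_direct_complete: "all" passes everything through;
    # otherwise rows are kept or dropped by the boolean flag.
    if query == "all":
        return df
    mask = df["direct_complete"] if query == "direct complete" else ~df["direct_complete"]
    return df[mask]

with gr.Blocks() as demo:
    hidden_df = gr.Dataframe(value=full_df, visible=False)
    prompting = gr.Radio(choices=["all", "chat template", "direct complete"], value="all")
    table = gr.Dataframe(value=full_df)
    # Same shape as filter_prompting_columns.change(...) in the diff: Gradio
    # reads the input components' current values, and the callback's return
    # value replaces the output Dataframe's contents.
    prompting.change(on_prompting_filter, [hidden_df, prompting], table)

demo.launch()

One gr.Radio per filter, each routed through .change into the same Dataframe, is why the commit adds a second radio rather than overloading the existing one: the two queries ("model type" and "prompting mode") stay independent.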
src/utils.py CHANGED
@@ -24,12 +24,11 @@ def fields(raw_class):
 class AutoEvalColumn:  # Auto evals column
     model_type_symbol = ColumnContent("type", "str", True)
     model = ColumnContent("model", "markdown", True)
-    size = ColumnContent("size", "number", False)
     complete_score = ColumnContent("complete", "number", True)
     instruct_score = ColumnContent("instruct", "number", True)
     elo_mle = ColumnContent("elo_mle", "number", True)
     dummy = ColumnContent("model", "str", True)
-    link = ColumnContent("link", "str", False)
+    size = ColumnContent("size", "number", False)
 
 
 def model_hyperlink(link, model_name):
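The reordering above is not merely cosmetic if, as the fields(raw_class) helper in the hunk context suggests, the declaration order of the ColumnContent attributes determines the order of columns surfaced to the UI: moving `size` after `dummy` would push it to the end of the column list. A sketch of that pattern under this assumption, with a simplified ColumnContent (the real class may differ):

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

def fields(raw_class):
    # A class's __dict__ preserves declaration order (Python 3.7+),
    # so columns come out in the order they are written in the class body.
    return [v for v in raw_class.__dict__.values() if isinstance(v, ColumnContent)]

class AutoEvalColumn:
    model = ColumnContent("model", "markdown", True)
    complete_score = ColumnContent("complete", "number", True)
    elo_mle = ColumnContent("elo_mle", "number", True)
    size = ColumnContent("size", "number", False)  # declared last -> listed last

print([c.name for c in fields(AutoEvalColumn)])
# ['model', 'complete', 'elo_mle', 'size']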