Alina Lozovskaia committed on
Commit a03f0fa
1 Parent(s): a5d34d3

ported new app.py [wip]

app.py CHANGED
@@ -1,10 +1,11 @@
 import os
-import time
+import pandas as pd
 import logging
+import time
 import gradio as gr
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from gradio_space_ci import enable_space_ci

 from src.display.about import (
@@ -49,14 +50,12 @@ from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df

-
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()

-
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

@@ -142,140 +141,7 @@ def load_and_create_plots():
     plot_df = create_plot_df(create_scores_df(raw_data))
     return plot_df

-
-# Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    hide_models: list,
-    query: str,
-):
-    filtered_df = filter_models(
-        df=hidden_df,
-        type_query=type_query,
-        size_query=size_query,
-        precision_query=precision_query,
-        hide_models=hide_models,
-    )
-    filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
-    return df
-
-
-def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
-    query = request.query_params.get("query") or ""
-    return (
-        query,
-        query,
-    )  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
-
-
-def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.fullname.name].str.contains(query, case=False, na=False))]
-
-
-def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
-
-
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-    dummy_col = [AutoEvalColumn.fullname.name]
-    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
-    return filtered_df
-
-
-def filter_queries(query: str, df: pd.DataFrame):
-    tmp_result_df = []
-
-    # Empty query return the same df
-    if query == "":
-        return df
-
-    # all_queries = [q.strip() for q in query.split(";")]
-    # license_queries = []
-    all_queries = [q.strip() for q in query.split(";") if q.strip() != ""]
-    model_queries = [q for q in all_queries if not q.startswith("licence")]
-    license_queries_raw = [q for q in all_queries if q.startswith("license")]
-    license_queries = [
-        q.replace("license:", "").strip() for q in license_queries_raw if q.replace("license:", "").strip() != ""
-    ]
-
-    # Handling model name search
-    for query in model_queries:
-        tmp_df = search_model(df, query)
-        if len(tmp_df) > 0:
-            tmp_result_df.append(tmp_df)
-
-    if not tmp_result_df and not license_queries:
-        # Nothing is found, no license_queries -> return empty df
-        return pd.DataFrame(columns=df.columns)
-
-    if tmp_result_df:
-        df = pd.concat(tmp_result_df)
-        df = df.drop_duplicates(
-            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-        )
-
-    if not license_queries:
-        return df
-
-    # Handling license search
-    tmp_result_df = []
-    for query in license_queries:
-        tmp_df = search_license(df, query)
-        if len(tmp_df) > 0:
-            tmp_result_df.append(tmp_df)
-
-    if not tmp_result_df:
-        # Nothing is found, return empty df
-        return pd.DataFrame(columns=df.columns)
-
-    df = pd.concat(tmp_result_df)
-    df = df.drop_duplicates(
-        subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-    )
-
-    return df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, hide_models: list
-) -> pd.DataFrame:
-    # Show all models
-    if "Private or deleted" in hide_models:
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-    else:
-        filtered_df = df
-
-    if "Contains a merge/moerge" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
-
-    if "MoE" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
-
-    if "Flagged" in hide_models:
-        filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
-leaderboard_df = filter_models(
-    df=leaderboard_df,
-    type_query=[t.to_str(" : ") for t in ModelType],
-    size_query=list(NUMERIC_INTERVALS.keys()),
-    precision_query=[i.value.name for i in Precision],
-    hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"],  # Deleted, merges, flagged, MoEs
-)
+print(leaderboard_df.columns)

 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -284,135 +150,40 @@ with demo:

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        hide_models = gr.CheckboxGroup(
-                            label="Hide models",
-                            choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
-                            value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
-                            interactive=True,
-                        )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                    + [AutoEvalColumn.fullname.name]
+            leaderboard = Leaderboard(
+                value=leaderboard_df,
+                datatype=[c.type for c in fields(AutoEvalColumn)],
+                select_columns=SelectColumns(
+                    default_selection=[
+                        c.name
+                        for c in fields(AutoEvalColumn)
+                        if c.displayed_by_default
+                    ],
+                    cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+                    label="Select Columns to Display:",
+                ),
+                search_columns=[
+                    AutoEvalColumn.model.name,
+                    AutoEvalColumn.fullname.name,
+                    AutoEvalColumn.license.name
                 ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    hide_models,
-                    search_bar,
+                hide_columns=[
+                    c.name
+                    for c in fields(AutoEvalColumn)
+                    if c.hidden
                 ],
-                leaderboard_table,
-            )
-
-            # Define a hidden component that will trigger a reload only if a query parameter has been set
-            hidden_search_bar = gr.Textbox(value="", visible=False)
-            hidden_search_bar.change(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    hide_models,
-                    search_bar,
+                filter_columns=[
+                    ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+                    ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+                    ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+                    ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True),
+                    ColumnFilter(AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True),
+                    ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+                    ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
                 ],
-                leaderboard_table,
+                bool_checkboxgroup_label="Hide models"
             )
-            # Check query parameter once at startup and update search bar + hidden component
-            demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
-
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_columns_precision,
-                filter_columns_size,
-                hide_models,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        hide_models,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
+
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
@@ -543,4 +314,4 @@ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
 scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
 scheduler.start()

-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
pyproject.toml CHANGED
@@ -47,6 +47,7 @@ gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci"
 gradio = "4.9.0"
 isort = "^5.13.2"
 ruff = "^0.3.5"
+gradio-leaderboard = "^0.0.7"

 [build-system]
 requires = ["poetry-core"]
requirements.txt CHANGED
@@ -13,4 +13,5 @@ sentencepiece
 tqdm==4.65.0
 transformers==4.40.0
 tokenizers>=0.15.0
-gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
+gradio_leaderboard
src/display/utils.py CHANGED
@@ -89,7 +89,7 @@ auto_eval_column_dict.append(
     ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
 )
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
@@ -123,7 +123,7 @@ baseline_row = {
     AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.fullname.name: "baseline",
     AutoEvalColumn.model_type.name: "",
-    AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.not_flagged.name: False,
 }

 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
@@ -148,7 +148,7 @@ human_baseline_row = {
     AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.fullname.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
-    AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.not_flagged.name: False,
 }

src/leaderboard/filter_models.py CHANGED
@@ -133,11 +133,16 @@ DO_NOT_SUBMIT_MODELS = [
 def flag_models(leaderboard_data: list[dict]):
     """Flags models based on external criteria or flagged status."""
     for model_data in leaderboard_data:
-        # Merges and moes are flagged automatically
-        if model_data[AutoEvalColumn.flagged.name]:
-            flag_key = "merged"
-        else:
+        # If a model is not flagged, use its "fullname" as a key
+        if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
+        else:
+            # Merges and moes are flagged
+            flag_key = "merged"
+
+        print(f"model check: {flag_key}")
+
+        # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
@@ -147,9 +152,9 @@ def flag_models(leaderboard_data: list[dict]):
             model_data[AutoEvalColumn.model.name] = (
                 f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
             )
-            model_data[AutoEvalColumn.flagged.name] = True
+            model_data[AutoEvalColumn.not_flagged.name] = False
         else:
-            model_data[AutoEvalColumn.flagged.name] = False
+            model_data[AutoEvalColumn.not_flagged.name] = True


 def remove_forbidden_models(leaderboard_data: list[dict]):
src/leaderboard/read_evals.py CHANGED
@@ -37,7 +37,7 @@ class EvalResult:
     date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
-    flagged: bool = False
+    not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
@@ -164,7 +164,7 @@ class EvalResult:
         self.tags = file_dict.get("tags", [])

         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
-        self.flagged = "flagged" in self.tags
+        self.not_flagged = not (any("flagged" in tag for tag in self.tags))


     def to_dict(self):
@@ -185,9 +185,9 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
-            AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
-            AutoEvalColumn.flagged.name: self.flagged,
+            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
+            AutoEvalColumn.moe.name: not (("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()),
+            AutoEvalColumn.not_flagged.name: self.not_flagged,
         }

         for task in Tasks:
src/tools/plots.py CHANGED
@@ -34,7 +34,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
         # We ignore models that are flagged/no longer on the hub/not finished
         to_ignore = (
            not row["still_on_hub"]
-            or row["flagged"]
+            or row["not_flagged"]
             or current_model in FLAGGED_MODELS
             or row["status"] != "FINISHED"
         )