sheonhan committed on
Commit
ffefe11
1 Parent(s): 2102b66

Implements a search bar for filtering the leaderboard by model name

Files changed (2)
  1. app.py +55 -20
  2. utils.py +2 -0
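The diff below wires a gr.Textbox to a filter over a hidden, unfiltered copy of the leaderboard dataframe, using a dummy model_name_for_query column that custom CSS keeps out of view. A minimal, self-contained sketch of that pattern (toy data standing in for the real leaderboard, component names simplified; it assumes the Gradio Blocks API and pandas already used by the app) looks roughly like this:

import gradio as gr
import pandas as pd

# Toy stand-in for the real leaderboard dataframe built by get_leaderboard_df().
original_df = pd.DataFrame(
    {
        "Model": ["GPT-4", "GPT-3.5", "baseline"],
        "HellaSwag (10-shot) ⬆️": [95.3, 85.5, 25.0],
        # Dummy column used only for querying; hidden in the UI by custom CSS.
        "model_name_for_query": ["GPT-4", "GPT-3.5", "baseline"],
    }
)


def search_table(df, query):
    # Case-insensitive substring match against the hidden query column.
    return df[df["model_name_for_query"].str.contains(query, case=False)]


with gr.Blocks() as demo:
    search_bar = gr.Textbox(label="Search bar")
    leaderboard_table = gr.Dataframe(value=original_df)
    # Hidden, unfiltered copy so deleting the query text restores the full table.
    hidden_table = gr.Dataframe(value=original_df, visible=False)
    search_bar.change(search_table, [hidden_table, search_bar], leaderboard_table)

demo.launch()

Filtering against the hidden copy rather than the visible table means every keystroke, including backspace, re-queries the full dataset instead of the already-filtered view.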
app.py CHANGED
@@ -81,16 +81,9 @@ COLS = [
     "HellaSwag (10-shot) ⬆️",
     "MMLU (5-shot) ⬆️",
     "TruthfulQA (0-shot) ⬆️",
+    "model_name_for_query",  # dummy column to implement search bar (hidden by custom CSS)
 ]
-TYPES = [
-    "markdown",
-    "str",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-]
+TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str"]

 if not IS_PUBLIC:
     COLS.insert(2, "8bit")
@@ -115,7 +108,7 @@ def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)


-def get_leaderboard():
+def get_leaderboard_df():
     if repo:
         print("Pulling evaluation results for the leaderboard.")
         repo.git_pull()
@@ -132,6 +125,7 @@ def get_leaderboard():
         "HellaSwag (10-shot) ⬆️": 95.3,
         "MMLU (5-shot) ⬆️": 86.4,
         "TruthfulQA (0-shot) ⬆️": 59.0,
+        "model_name_for_query": "GPT-4",
     }
     all_data.append(gpt4_values)
     gpt35_values = {
@@ -143,6 +137,7 @@ def get_leaderboard():
         "HellaSwag (10-shot) ⬆️": 85.5,
         "MMLU (5-shot) ⬆️": 70.0,
         "TruthfulQA (0-shot) ⬆️": 47.0,
+        "model_name_for_query": "GPT-3.5",
     }
     all_data.append(gpt35_values)

@@ -155,6 +150,7 @@ def get_leaderboard():
         "HellaSwag (10-shot) ⬆️": 25.0,
         "MMLU (5-shot) ⬆️": 25.0,
         "TruthfulQA (0-shot) ⬆️": 25.0,
+        "model_name_for_query": "baseline",
     }

     all_data.append(base_line)
@@ -168,7 +164,7 @@ def get_leaderboard():
     return df


-def get_eval_table():
+def get_evaluation_queue_df():
     if repo:
         print("Pulling changes for the evaluation queue.")
         repo.git_pull()
@@ -216,8 +212,13 @@ def get_eval_table():
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]


-leaderboard = get_leaderboard()
-finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
+original_df = get_leaderboard_df()
+leaderboard_df = original_df.copy()
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df()


 def is_model_on_hub(model_name, revision) -> bool:
@@ -294,9 +295,18 @@ def add_new_eval(


 def refresh():
-    leaderboard = get_leaderboard()
-    finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
-    return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
+    leaderboard_df = get_leaderboard_df()
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df()
+    return (
+        leaderboard_df,
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    )


 custom_css = """
@@ -324,8 +334,20 @@ custom_css = """
     margin: 6px;
     transform: scale(1.3);
 }
+
+/* Hides the final column */
+table td:last-child,
+table th:last-child {
+    display: none;
+}
 """

+
+def search_table(df, query):
+    filtered_df = df[df["model_name_for_query"].str.contains(query, case=False)]
+    return filtered_df
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -343,22 +365,35 @@ with demo:
     with gr.Accordion("✨ CHANGELOG", open=False):
         changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")

+    search_bar = gr.Textbox(label="Search bar")
+
     leaderboard_table = gr.components.Dataframe(
-        value=leaderboard, headers=COLS, datatype=TYPES, max_rows=5
+        value=leaderboard_df, headers=COLS, datatype=TYPES, max_rows=5
+    )
+
+    # Dummy leaderboard for handling the case when the user uses backspace key
+    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+        value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
+    )
+
+    search_bar.change(
+        search_table,
+        [hidden_leaderboard_table_for_search, search_bar],
+        leaderboard_table,
     )

     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

     with gr.Accordion("✅ Finished Evaluations", open=False):
         finished_eval_table = gr.components.Dataframe(
-            value=finished_eval_queue,
+            value=finished_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
         )
     with gr.Accordion("🔄 Running Evaluation Queue", open=False):
         running_eval_table = gr.components.Dataframe(
-            value=running_eval_queue,
+            value=running_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
@@ -366,7 +401,7 @@ with demo:

     with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
         pending_eval_table = gr.components.Dataframe(
-            value=pending_eval_queue,
+            value=pending_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
utils.py CHANGED
@@ -71,6 +71,8 @@ class EvalResult:
         data_dict["eval_name"] = self.eval_name
         data_dict["8bit"] = self.is_8bit
         data_dict["Model"] = make_clickable_model(base_model)
+        # dummy column to implement search bar (hidden by custom CSS)
+        data_dict["model_name_for_query"] = base_model
         data_dict["Revision"] = self.revision
         data_dict["Average ⬆️"] = round(
             sum([v for k, v in self.results.items()]) / 4.0, 1