sheonhan committed on
Commit
35a0978
1 Parent(s): 3f0aca6

Add a baseline

Browse files
Files changed (1) hide show
  1. app.py +18 -6
app.py CHANGED
@@ -1,13 +1,11 @@
1
  import os
2
- import shutil
3
  import numpy as np
4
  import gradio as gr
5
  from huggingface_hub import Repository, HfApi
6
  from transformers import AutoConfig
7
  import json
8
- from apscheduler.schedulers.background import BackgroundScheduler
9
  import pandas as pd
10
- import datetime
11
  from utils import get_eval_results_dicts, make_clickable_model
12
 
13
  # clone / pull the lmeh eval data
@@ -140,6 +138,19 @@ def get_leaderboard():
140
  }
141
  all_data.append(gpt35_values)
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  df = pd.DataFrame.from_records(all_data)
144
  df = df.sort_values(by=["Average ⬆️"], ascending=False)
145
  df = df[COLS]
@@ -323,7 +334,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
323
 
324
  """
325
  )
326
- with gr.Accordion("Finished Evaluations", open=False):
327
  with gr.Row():
328
  finished_eval_table = gr.components.Dataframe(
329
  value=finished_eval_queue,
@@ -331,7 +342,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
331
  datatype=EVAL_TYPES,
332
  max_rows=5,
333
  )
334
- with gr.Accordion("Running Evaluation Queue", open=False):
335
  with gr.Row():
336
  running_eval_table = gr.components.Dataframe(
337
  value=running_eval_queue,
@@ -340,7 +351,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
340
  max_rows=5,
341
  )
342
 
343
- with gr.Accordion("Pending Evaluation Queue", open=False):
344
  with gr.Row():
345
  pending_eval_table = gr.components.Dataframe(
346
  value=pending_eval_queue,
@@ -378,6 +389,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
378
 
379
  with gr.Row():
380
  submit_button = gr.Button("Submit Eval")
 
381
  with gr.Row():
382
  submission_result = gr.Markdown()
383
  submit_button.click(
 
1
  import os
 
2
  import numpy as np
3
  import gradio as gr
4
  from huggingface_hub import Repository, HfApi
5
  from transformers import AutoConfig
6
  import json
 
7
  import pandas as pd
8
+ from content import CHANGELOG_TEXT
9
  from utils import get_eval_results_dicts, make_clickable_model
10
 
11
  # clone / pull the lmeh eval data
 
138
  }
139
  all_data.append(gpt35_values)
140
 
141
+ base_line = {
142
+ "Model": '<p>Baseline</p>',
143
+ "Revision": "N/A",
144
+ "8bit": None,
145
+ "Average ⬆️": 25.0,
146
+ "ARC (25-shot) ⬆️": 25.0,
147
+ "HellaSwag (10-shot) ⬆️": 25.0,
148
+ "MMLU (5-shot) ⬆️": 25.0,
149
+ "TruthfulQA (0-shot) ⬆️": 25.0,
150
+ }
151
+
152
+ all_data.append(base_line)
153
+
154
  df = pd.DataFrame.from_records(all_data)
155
  df = df.sort_values(by=["Average ⬆️"], ascending=False)
156
  df = df[COLS]
 
334
 
335
  """
336
  )
337
+ with gr.Accordion("Finished Evaluations", open=False):
338
  with gr.Row():
339
  finished_eval_table = gr.components.Dataframe(
340
  value=finished_eval_queue,
 
342
  datatype=EVAL_TYPES,
343
  max_rows=5,
344
  )
345
+ with gr.Accordion("🔄 Running Evaluation Queue", open=False):
346
  with gr.Row():
347
  running_eval_table = gr.components.Dataframe(
348
  value=running_eval_queue,
 
351
  max_rows=5,
352
  )
353
 
354
+ with gr.Accordion("Pending Evaluation Queue", open=False):
355
  with gr.Row():
356
  pending_eval_table = gr.components.Dataframe(
357
  value=pending_eval_queue,
 
389
 
390
  with gr.Row():
391
  submit_button = gr.Button("Submit Eval")
392
+
393
  with gr.Row():
394
  submission_result = gr.Markdown()
395
  submit_button.click(