Ori committed on
Commit 12ca829
1 Parent(s): c0479ad

Update app.py

Files changed (1)
  1. app.py +221 -181
app.py CHANGED
@@ -1,204 +1,244 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download
-
- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval


  def restart_space():
-     API.restart_space(repo_id=REPO_ID)

- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
-         interactive=False,
-     )


- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)

-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
              with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )

      with gr.Row():
          with gr.Accordion("📙 Citation", open=False):
              citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
                  lines=20,
                  elem_id="citation-button",
-                 show_copy_button=True,
              )

  scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
  scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
+ import os
+ import json
+ import datetime
+ from email.utils import parseaddr
+
  import gradio as gr
  import pandas as pd
+ from datasets import load_dataset
  from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import HfApi
+ from content import format_error, format_warning, format_log, TITLE
+
+
+ # Placeholder for the question_scorer function
+ def question_scorer(prediction, gold_answer):
+     return 1 if prediction == gold_answer else 0
+
+
+ # Constants and Configuration
+ TOKEN = os.environ.get("TOKEN", None)
+ OWNER = "Ori"
+ DATA_DATASET = f"Ori/AssistantBench_V1.0"
+ RESULTS_DATASET = f"Ori/results"
+ SUBMISSION_DATASET = f"{OWNER}/submissions"
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
+ api = HfApi()
+
+ YEAR_VERSION = "2024"
+
+ os.makedirs("scored", exist_ok=True)
+
+ # Load datasets
+ eval_results = load_dataset(RESULTS_DATASET, token=TOKEN, download_mode="force_redownload",
+                             ignore_verifications=True, trust_remote_code=True)
+ gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
+ gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}


+ # Function to get dataframe from results
+ def get_dataframe_from_results(eval_results, split):
+     local_df = eval_results[split]
+     df = pd.DataFrame(local_df)
+     df = df.sort_values(by=["Accuracy"], ascending=False)
+     numeric_cols = [c for c in local_df.column_names if "score" in c]
+     df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
+     return df
+
+
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+
+
+ # Function to restart the space
  def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


+ TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]


+ # Function to add a new evaluation
+ def add_new_eval(
+         model_name: str,
+         model_family: str,
+         url: str,
+         path_to_file: str,
+         organization: str,
+         mail: str,
+ ):
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     print("Adding new eval")
+
+     if model_name.lower() in set(
+             [m.lower() for m in eval_results["test"]["Model Name"]]) and organization.lower() in set(
+             [o.lower() for o in eval_results["test"]["Organization"]]):
+         return format_warning("This model has already been submitted.")

+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     api.upload_file(
+         repo_id=SUBMISSION_DATASET,
+         path_or_fileobj=path_to_file.name,
+         path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_raw_{datetime.datetime.today()}.jsonl",
+         repo_type="dataset",
+         token=TOKEN
+     )
+
+     file_path = path_to_file.name
+     scores = 0
+     num_questions = 0
+     with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
+         with open(file_path, 'r') as f:
+             for ix, line in enumerate(f):
+                 try:
+                     task = json.loads(line)
+                 except Exception:
+                     return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+                 if "answer" not in task:
+                     return format_error(
+                         f"Line {ix} contains no answer key. Please fix it and resubmit your file.")
+
+                 answer = task["answer"]
+                 task_id = task["id"]
+                 if task_id not in gold_answers["test"]:
+                     return format_error(
+                         f"{task_id} not found in test set. Are you sure you submitted the correct file?")
+
+                 score = question_scorer(task['answer'], gold_answers["test"][task_id])
+                 scored_file.write(
+                     json.dumps({
+                         "id": task_id,
+                         "model_answer": answer,
+                         "score": score
+                     }) + "\n"
+                 )
+                 scores += score
+                 num_questions += 1
+
+     api.upload_file(
+         repo_id=SUBMISSION_DATASET,
+         path_or_fileobj=f"scored/{organization}_{model_name}.jsonl",
+         path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_scored_{datetime.datetime.today()}.jsonl",
+         repo_type="dataset",
+         token=TOKEN
+     )

+     eval_entry = {
+         "Model Name": model_name,
+         "Model Family": model_family,
+         "URL": url,
+         "Organization": organization,
+         "Accuracy": scores / num_questions if num_questions > 0 else 0,
+         "Answer rate": scores / num_questions if num_questions > 0 else 0,
+         "Precision": scores / num_questions if num_questions > 0 else 0,
+         "EM": scores if num_questions > 0 else 0,
+         "Cost": 0,  # Placeholder for cost, update with actual value if needed
+     }
+     eval_results["test"] = eval_results["test"].add_item(eval_entry)
+     eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
+
+     return format_log(
+         f"Model {model_name} submitted by {organization} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+
+
+ # Function to refresh the results
+ def refresh():
+     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload",
+                                 ignore_verifications=True, trust_remote_code=True)
+     eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+     return eval_dataframe_test
+
+
+ # Gradio interface
+ demo = gr.Blocks()
+ with demo:
+     gr.HTML("<h1>AssistantBench</h1>")
+     gr.Markdown("""
+     AssistantBench aims to evaluate the ability of web agents to assist with real and time-consuming tasks.
+     For more information, please check out our paper or the official website.
+     To download AssistantBench, click [here](https://huggingface.co/datasets/Ori/AssistantBench_V1.0).
+     """)
+
+     gr.HTML("<h2>AssistantBench Leaderboard</h2>")
+     with gr.Tab("Results: Test"):
+         leaderboard_table_test = gr.Dataframe(
+             value=eval_dataframe_test, datatype=TYPES, interactive=False,
+             column_widths=["20%"]
+         )
+
+     refresh_button = gr.Button("Refresh")
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[
+             leaderboard_table_test,
+         ],
+     )
+
+     gr.HTML("<h2>Making a New Submission</h2>")
+     with gr.Accordion("Submit a new model for evaluation"):
+         with gr.Row():
+             gr.Markdown("""
+             To make a new submission, upload a predictions file. We support JSONL files with the following format:
+             ```
+             {"id": "task_id_1", "answer": "Answer 1 from your model"}
+             {"id": "task_id_2", "answer": "Answer 2 from your model"}
+             ```
+             Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py).
+             """)
+         with gr.Row():
              with gr.Column():
+                 model_name_textbox = gr.Textbox(label="Model Name")
+                 model_family_textbox = gr.Textbox(label="Model Family")
+                 url_textbox = gr.Textbox(label="URL to Model Information")
+             with gr.Column():
+                 organization = gr.Textbox(label="Organization")
+                 mail = gr.Textbox(
+                     label="Contact Email (will be stored privately & used if there is an issue with your submission)")
+                 file_output = gr.File()
+
+         submit_button = gr.Button("Submit Eval")
+         submission_result = gr.Markdown()
+         submit_button.click(
+             add_new_eval,
+             [
+                 model_name_textbox,
+                 model_family_textbox,
+                 url_textbox,
+                 file_output,
+                 organization,
+                 mail
+             ],
+             submission_result,
+         )

      with gr.Row():
          with gr.Accordion("📙 Citation", open=False):
+             citation_text = """@article{yoran-etal-2023-assistantbench,
+             title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
+             author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
+             year={2024},
+             eprint={TODO},
+             archivePrefix={arXiv},
+             primaryClass={cs.CL}
+             }"""
              citation_button = gr.Textbox(
+                 value=citation_text,
+                 label="Citation",
                  lines=20,
                  elem_id="citation-button",
+                 show_copy_button=True
              )

+     gr.HTML(
+         "<p>We would like to thank the GAIA team, whose leaderboard template this Space builds on, and Hugging Face for hosting the leaderboard.</p>")
+
  scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
  scheduler.start()
+ demo.launch(debug=True)
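
For reference, the JSONL format described in the submission instructions above can be produced and sanity-checked locally before uploading. The sketch below mirrors the per-line checks performed by `add_new_eval` (each line is valid JSON, contains an `answer` key, and uses a known task `id`); the helper name and the example ids are illustrative and not part of this commit.

```python
# Illustrative only (not part of this commit): write a predictions file in the
# expected JSONL format and re-run the same per-line checks as add_new_eval.
import json

def write_and_check_predictions(predictions, known_ids, path="predictions.jsonl"):
    # predictions: dict mapping task id -> model answer, e.g. {"task_id_1": "Answer 1"}
    with open(path, "w", encoding="utf-8") as f:
        for task_id, answer in predictions.items():
            f.write(json.dumps({"id": task_id, "answer": answer}) + "\n")

    with open(path, "r", encoding="utf-8") as f:
        for ix, line in enumerate(f):
            task = json.loads(line)  # each line must parse as JSON
            assert "answer" in task, f"Line {ix} has no 'answer' key"
            assert task["id"] in known_ids, f"Line {ix}: unknown task id {task['id']}"
    return path

# Example with made-up ids:
# write_and_check_predictions({"task_id_1": "Answer 1"}, known_ids={"task_id_1", "task_id_2"})
```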