hsaest committed
Commit 4283eb3
1 Parent(s): b1af7e8

Update app.py

Files changed (1)
  1. app.py +42 -26
app.py CHANGED
@@ -35,8 +35,8 @@ os.makedirs("scored", exist_ok=True)
 
 # # Display the results
 eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-def get_dataframe_from_results(eval_results, split):
-    local_df = eval_results[split]
+def get_dataframe_from_results(eval_results, split, mode):
+    local_df = eval_results[f'{split}_{mode}']
     local_df = local_df.remove_columns(["Mail"])
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["Final Pass Rate"], ascending=False)
@@ -45,9 +45,10 @@ def get_dataframe_from_results(eval_results, split):
     return df
 
 
-eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
-eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
-
+eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test", mode='twostage')
+eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test", mode='soleplanning')
 
 
 # def restart_space():
@@ -67,6 +68,7 @@ def add_new_eval(
     val_or_test: str,
     eval_mode: str,
     model: str,
+    tooluse_strategy: str,
     planning_strategy: str,
     organization: str,
     mail: str,
@@ -86,7 +88,7 @@ def add_new_eval(
     api.upload_file(
         repo_id=RESULTS_DATASET,
         path_or_fileobj=path_to_file.name,
-        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
+        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -94,14 +96,14 @@ def add_new_eval(
     # Compute score
     file_path = path_to_file.name
     result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
-    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
+    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl", "w") as scored_file:
         scored_file.write(json.dumps(result) + "\n")
 
     # Save scored file
     api.upload_file(
         repo_id=RESULTS_DATASET,
-        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
-        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
+        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
+        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -109,6 +111,7 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "Model": model,
+        "Tool-use Strategy": tooluse_strategy,
         "Planning Strategy": planning_strategy,
         "Organization": organization,
         "Mail": mail,
@@ -119,21 +122,23 @@ def add_new_eval(
         "Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
         "Final Pass Rate":result['Final Pass Rate']
     }
-
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    eval_mode = eval_mode.replace('-','')
+    eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
 
     print(eval_results)
 
     eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
 
-    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
+    return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
 
 
 def refresh():
     eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
-    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
-    return eval_dataframe_val, eval_dataframe_test
+    eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+    eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+    eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test", mode='twostage')
+    eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test", mode='soleplanning')
+    return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
 
 # def upload_file(files):
 #     file_paths = [file.name for file in files]
@@ -145,13 +150,22 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-    with gr.Tab("Results: Validation"):
-        leaderboard_table_val = gr.components.Dataframe(
-            value=eval_dataframe_val, interactive=False,
+    with gr.Tab("Results: Validation | Two-Stage"):
+        leaderboard_table_val_twostage = gr.components.Dataframe(
+            value=eval_dataframe_val_twostage, interactive=False,
+        )
+    with gr.Tab("Results: Validation | Sole-Planning"):
+        leaderboard_table_val_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_val_soleplanning, interactive=False,
+        )
+
+    with gr.Tab("Results: Test | Two-Stage"):
+        leaderboard_table_test_twostage = gr.components.Dataframe(
+            value=eval_dataframe_test_twostage, interactive=False,
         )
-    with gr.Tab("Results: Test"):
-        leaderboard_table_test = gr.components.Dataframe(
-            value=eval_dataframe_test, interactive=False,
+    with gr.Tab("Results: Test | Sole-Planning"):
+        leaderboard_table_test_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_test_soleplanning, interactive=False,
         )
 
     refresh_button = gr.Button("Refresh")
@@ -159,8 +173,10 @@ with demo:
         refresh,
        inputs=[],
        outputs=[
-            leaderboard_table_val,
-            leaderboard_table_test,
+            leaderboard_table_val_twostage,
+            leaderboard_table_val_soleplanning,
+            leaderboard_table_test_twostage,
+            leaderboard_table_test_soleplanning,
        ],
    )
    with gr.Accordion("Submit a new file for evaluation"):
@@ -169,6 +185,7 @@ with demo:
             level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
            eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
            model = gr.Textbox(label="Foundation Model")
+            tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
            planning_strategy = gr.Textbox(label="Planning Strategy")
        with gr.Column():
            organization = gr.Textbox(label="Organization")
@@ -184,6 +201,7 @@ with demo:
            level_of_test,
            eval_mode,
            model,
+            tooluse_strategy,
            planning_strategy,
            organization,
            mail,
@@ -192,8 +210,6 @@ with demo:
        submission_result,
    )
 
-# scheduler = BackgroundScheduler()
-# scheduler.add_job(restart_space, "interval", seconds=3600)
-# scheduler.start()
 demo.launch(debug=True)
 
+
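The substantive change here is that leaderboard entries are now keyed by both split and eval mode: get_dataframe_from_results indexes eval_results[f'{split}_{mode}'], and add_new_eval derives the same key by stripping the hyphen from the radio value ("two-stage" becomes "twostage"). A minimal sketch of that key mapping, assuming the 'scores' dataset now carries four splits named validation_twostage, validation_soleplanning, test_twostage, and test_soleplanning; the mode_key helper below is illustrative, not part of the commit:

def mode_key(split: str, eval_mode: str) -> str:
    # Mirrors eval_mode.replace('-','') in add_new_eval: the Gradio radio
    # values "two-stage" / "sole-planning" become the dataset split suffixes.
    return f"{split}_{eval_mode.replace('-', '')}"

assert mode_key("validation", "two-stage") == "validation_twostage"
assert mode_key("test", "sole-planning") == "test_soleplanning"

Note that refresh() must return its four dataframes in the same order as the outputs list wired to refresh_button; the diff updates both sides together.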