AIR-Bench (Hugging Face Space)

Commit b80bda9 (parent: c1df819), committed by nan

feat-add-tabs-for-noreranker-0605 (#17)

This commit replaces the "Only show results without ranking models" button with explicit "Retriever + Reranker" and "Retriever Only" sub-tabs on both the QA and Long Doc leaderboards, moves the version and metric selectors into their own column, and bumps the displayed title from (Preview) to (v0.0.2).

Squashed commits:

- feat: add tabs for noreranker (cbde34683e23fb24e5aa025c36226defe3bf8ffe)
- fix: fix the update_metric_qa (03a7ba81e8f1938438cc84f3bd075ed6320bae6e)
- feat: add noreranker tabs to long-doc (b7deaabc6f2fbecc29c8c429540154ff1a1e9924)
- chore: update the versioning (6c77182af5ac643222324724dac29d45790849f4)
- chore: rollback the settings (fbcdfeb6d0292777a6dfb5a101e9673ddedd2281)
- chore: clean up (7beeb691a9f0f328ee18658e5d239c1a76a52660)

app.py CHANGED
@@ -11,11 +11,12 @@ from src.about import (
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
     DEFAULT_METRIC
 from src.display.css_html_js import custom_css
-from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from src.read_evals import get_raw_eval_results, get_leaderboard_df
-from src.utils import update_metric, upload_file, get_default_cols, submit_results
-from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table
+from src.utils import update_metric, upload_file, get_default_cols, submit_results, reset_rank
+from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, \
+    get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table, get_noreranking_dropdown
 from src.display.gradio_listener import set_listeners
 
 def restart_space():
@@ -82,6 +83,7 @@ def update_metric_long_doc(
 
 
 demo = gr.Blocks(css=custom_css)
+
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -89,81 +91,107 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
             with gr.Row():
-                with gr.Column():
-                    # search retrieval models
-                    with gr.Row():
-                        selected_version = get_version_dropdown()
-                    with gr.Row():
-                        search_bar = get_search_bar()
-                    with gr.Row():
-                        selected_rerankings = get_reranking_dropdown(reranking_models)
-                    with gr.Row():
-                        select_noreranker_only_btn = get_noreranker_button()
-
                 with gr.Column(min_width=320):
-                    # select the metric
-                    selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                     # select domain
                     with gr.Row():
                         selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
                     # select language
                     with gr.Row():
                         selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
+
+                with gr.Column():
+                    with gr.Row():
+                        selected_version = get_version_dropdown()
+                    # select the metric
+                    selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
             with gr.Row():
                 show_anonymous = get_anonymous_checkbox()
             with gr.Row():
                 show_revision_and_timestamp = get_revision_and_ts_checkbox()
 
+            with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+                with gr.TabItem("Retriever + Reranker", id=10):
+                    with gr.Row():
+                        # search retrieval models
+                        with gr.Column():
+                            search_bar = get_search_bar()
+                        # select reranking models
+                        with gr.Column():
+                            selected_rerankings = get_reranking_dropdown(reranking_models)
+                    leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
 
-            leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
-
-            set_listeners(
-                "qa",
-                leaderboard_table,
-                hidden_leaderboard_table_for_search,
-                search_bar,
-                select_noreranker_only_btn,
-                selected_domains,
-                selected_langs,
-                selected_rerankings,
-                show_anonymous,
-                show_revision_and_timestamp,
-            )
-
-            # set metric listener
-            selected_metric.change(
-                update_metric_qa,
-                [
-                    selected_metric,
-                    selected_domains,
-                    selected_langs,
-                    selected_rerankings,
-                    search_bar,
-                    show_anonymous,
-                ],
-                leaderboard_table,
-                queue=True
-            )
+                    set_listeners(
+                        "qa",
+                        leaderboard_table,
+                        hidden_leaderboard_table_for_search,
+                        search_bar,
+                        selected_domains,
+                        selected_langs,
+                        selected_rerankings,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_qa,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_rerankings,
+                            search_bar,
+                            show_anonymous,
+                            show_revision_and_timestamp,
+                        ],
+                        leaderboard_table,
+                        queue=True
+                    )
+                with gr.TabItem("Retriever Only", id=11):
+                    with gr.Column():
+                        search_bar_retriever = get_search_bar()
+                        selected_noreranker = get_noreranking_dropdown()
+                    lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                    lb_df_retriever = reset_rank(lb_df_retriever)
+                    hidden_lb_db_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                    hidden_lb_db_retriever = reset_rank(hidden_lb_db_retriever)
+                    lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_db_retriever, types_qa, visible=False)
+
+                    set_listeners(
+                        "qa",
+                        lb_table_retriever,
+                        hidden_lb_table_retriever,
+                        search_bar_retriever,
+                        selected_domains,
+                        selected_langs,
+                        selected_noreranker,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_qa,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_noreranker,
+                            search_bar_retriever,
+                            show_anonymous,
+                            show_revision_and_timestamp,
+                        ],
+                        lb_table_retriever,
+                        queue=True
+                    )
 
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        selected_version = get_version_dropdown()
-                    with gr.Row():
-                        search_bar = get_search_bar()
-                    # select reranking model
-                    with gr.Row():
-                        selected_rerankings = get_reranking_dropdown(reranking_models)
-                    with gr.Row():
-                        select_noreranker_only_btn = get_noreranker_button()
                 with gr.Column(min_width=320):
-                    # select the metric
-                    with gr.Row():
-                        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                     # select domain
                     with gr.Row():
                         selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
@@ -172,48 +200,106 @@ with demo:
                         selected_langs = get_language_dropdown(
                             LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
                         )
+                with gr.Column():
+                    with gr.Row():
+                        selected_version = get_version_dropdown()
+                    # select the metric
+                    with gr.Row():
+                        selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
             with gr.Row():
                 show_anonymous = get_anonymous_checkbox()
             with gr.Row():
                 show_revision_and_timestamp = get_revision_and_ts_checkbox()
+            with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+                with gr.TabItem("Retriever + Reranker", id=20):
+                    with gr.Row():
+                        with gr.Column():
+                            search_bar = get_search_bar()
+                        # select reranking model
+                        with gr.Column():
+                            selected_rerankings = get_reranking_dropdown(reranking_models)
+
+                    lb_table = get_leaderboard_table(
+                        leaderboard_df_long_doc, types_long_doc
+                    )
+
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_lb_table_for_search = get_leaderboard_table(
+                        original_df_long_doc, types_long_doc, visible=False
+                    )
+
+                    set_listeners(
+                        "long-doc",
+                        lb_table,
+                        hidden_lb_table_for_search,
+                        search_bar,
+                        selected_domains,
+                        selected_langs,
+                        selected_rerankings,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_long_doc,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_rerankings,
+                            search_bar,
+                            show_anonymous,
+                            show_revision_and_timestamp
+                        ],
+                        lb_table,
+                        queue=True
+                    )
+
+                with gr.TabItem("Retriever Only", id=21):
+                    with gr.Column():
+                        search_bar_retriever = get_search_bar()
+                        selected_noreranker = get_noreranking_dropdown()
+                    lb_df_retriever_long_doc = leaderboard_df_long_doc[
+                        leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                    ]
+                    lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
+                    hidden_lb_db_retriever_long_doc = original_df_long_doc[
+                        original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                    ]
+                    hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
+                    lb_table_retriever_long_doc = get_leaderboard_table(
+                        lb_df_retriever_long_doc, types_long_doc)
+                    hidden_lb_table_retriever_long_doc = get_leaderboard_table(
+                        hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
+                    )
+
+                    set_listeners(
+                        "long-doc",
+                        lb_table_retriever_long_doc,
+                        hidden_lb_table_retriever_long_doc,
+                        search_bar_retriever,
+                        selected_domains,
+                        selected_langs,
+                        selected_noreranker,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
 
-            leaderboard_table = get_leaderboard_table(
-                leaderboard_df_long_doc, types_long_doc
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search =get_leaderboard_table(
-                original_df_long_doc, types_long_doc, visible=False
-            )
-
-            set_listeners(
-                "long-doc",
-                leaderboard_table,
-                hidden_leaderboard_table_for_search,
-                search_bar,
-                select_noreranker_only_btn,
-                selected_domains,
-                selected_langs,
-                selected_rerankings,
-                show_anonymous,
-                show_revision_and_timestamp,
-            )
-
-            # set metric listener
-            selected_metric.change(
-                update_metric_long_doc,
-                [
-                    selected_metric,
-                    selected_domains,
-                    selected_langs,
-                    selected_rerankings,
-                    search_bar,
-                    show_anonymous,
-                    show_revision_and_timestamp
-                ],
-                leaderboard_table,
-                queue=True
-            )
+                    selected_metric.change(
+                        update_metric_long_doc,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_noreranker,
+                            search_bar_retriever,
+                            show_anonymous,
+                            show_revision_and_timestamp,
+                        ],
+                        lb_table_retriever_long_doc,
+                        queue=True
+                    )
 
         with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
             with gr.Column():
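The core pattern is the same on both leaderboards: one set of domain/language/version/metric controls feeds two sub-tabs, and the "Retriever Only" view is produced by filtering the leaderboard frame down to NoReranker rows and recomputing ranks. A minimal, self-contained sketch of that pattern (the toy DataFrame and its column names are illustrative, not the leaderboard's real schema, which the app builds in get_leaderboard_df):

```python
import gradio as gr
import pandas as pd

# Toy stand-in for the leaderboard frame; column names are illustrative only.
df = pd.DataFrame({
    "Rank": [1, 2, 3],
    "Retrieval Model": ["model-a", "model-b", "model-a"],
    "Reranking Model": ["reranker-x", "NoReranker", "NoReranker"],
    "Average": [71.2, 65.4, 60.1],
})

def reset_rank(frame: pd.DataFrame) -> pd.DataFrame:
    # Mirrors src/utils.reset_rank: recompute ranks after filtering,
    # with tied averages sharing the smallest ("min") rank.
    frame["Rank"] = frame["Average"].rank(ascending=False, method="min")
    return frame

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Retriever + Reranker", id=10):
            gr.Dataframe(df)  # full table, reranked runs included
        with gr.TabItem("Retriever Only", id=11):
            # Keep only runs without a reranker, then renumber the Rank column.
            retriever_only = reset_rank(df[df["Reranking Model"] == "NoReranker"].copy())
            gr.Dataframe(retriever_only)

demo.launch()
```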
src/about.py CHANGED
@@ -1,6 +1,6 @@
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
-(Preview) </h1>"""
+(v0.0.2) </h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
src/display/formatting.py CHANGED
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
 
 def make_clickable_model(model_name: str, model_link: str):
     # link = f"https://huggingface.co/{model_name}"
-    if not model_link or not model_link.startswith("https://"):
+    if not model_link or not model_link.startswith("https://") or model_name == "BM25":
         return model_name
     return model_hyperlink(model_link, model_name)
 
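The extra `model_name == "BM25"` guard keeps the BM25 baseline rendered as plain text: it is a classic lexical baseline rather than a hosted model, so there is no meaningful model page to link to. A quick sketch of the resulting behavior (the `model_hyperlink` body below is an assumption inferred from its call signature; only `make_clickable_model` appears in this diff):

```python
def model_hyperlink(link, model_name):
    # Assumed implementation: render an HTML anchor for the leaderboard table.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def make_clickable_model(model_name: str, model_link: str):
    if not model_link or not model_link.startswith("https://") or model_name == "BM25":
        return model_name
    return model_hyperlink(model_link, model_name)

print(make_clickable_model("bge-m3", "https://huggingface.co/BAAI/bge-m3"))  # HTML anchor
print(make_clickable_model("BM25", "https://example.com/bm25"))              # plain "BM25"
```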
src/display/gradio_formatting.py CHANGED
@@ -28,7 +28,14 @@ def get_reranking_dropdown(model_list):
         multiselect=True
     )
 
-
+def get_noreranking_dropdown():
+    return gr.Dropdown(
+        choices=["NoReranker",],
+        value=["NoReranker",],
+        interactive=False,
+        multiselect=True,
+        visible=False
+    )
 def get_noreranker_button():
     return gr.Button(
         value="Only show results without ranking models",
src/display/gradio_listener.py CHANGED
@@ -1,4 +1,4 @@
-from src.utils import update_table, update_table_long_doc, clear_reranking_selections
+from src.utils import update_table, update_table_long_doc
 
 
 def set_listeners(
@@ -6,7 +6,6 @@ def set_listeners(
     displayed_leaderboard,
     hidden_leaderboard,
     search_bar,
-    select_noreranker_only_btn,
     selected_domains,
     selected_langs,
     selected_rerankings,
@@ -52,9 +51,3 @@ def set_listeners(
         displayed_leaderboard,
         queue=True,
     )
-
-
-    select_noreranker_only_btn.click(
-        clear_reranking_selections,
-        outputs=selected_rerankings
-    )
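With the button and its click handler gone, `set_listeners` is reduced to pure event wiring that both sub-tabs share. The hunks only show its signature and tail, so the following is a hedged reconstruction of its overall shape, not the repo's exact body:

```python
from src.utils import update_table, update_table_long_doc

def set_listeners(
    task,
    displayed_leaderboard,
    hidden_leaderboard,
    search_bar,
    selected_domains,
    selected_langs,
    selected_rerankings,
    show_anonymous,
    show_revision_and_timestamp,
):
    # Assumed body: pick the per-task update function and re-render the
    # visible table whenever any control changes or a search is submitted.
    update_fn = update_table if task == "qa" else update_table_long_doc
    inputs = [
        hidden_leaderboard,  # unfiltered frame that filtering starts from
        selected_domains,
        selected_langs,
        selected_rerankings,
        search_bar,
        show_anonymous,
        show_revision_and_timestamp,
    ]
    search_bar.submit(update_fn, inputs, displayed_leaderboard, queue=True)
    for control in (selected_domains, selected_langs, selected_rerankings,
                    show_anonymous, show_revision_and_timestamp):
        control.change(update_fn, inputs, displayed_leaderboard, queue=True)
```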
src/read_evals.py CHANGED
@@ -25,6 +25,7 @@ from src.display.utils import (
 
 from src.display.formatting import make_clickable_model
 
+pd.options.mode.copy_on_write = True
 
 def calculate_mean(row):
     if pd.isna(row).any():
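The one functional line added here, `pd.options.mode.copy_on_write = True`, opts the app into pandas copy-on-write semantics (an opt-in mode since pandas 1.5). That matters for this commit because the new "Retriever Only" tabs slice the leaderboard frames (`df[df[...] == "NoReranker"]`) and then write a recomputed Rank column into the slice; under copy-on-write the slice behaves as an independent copy instead of triggering SettingWithCopyWarning. A small illustration:

```python
import pandas as pd

pd.options.mode.copy_on_write = True

df = pd.DataFrame({"model": ["a", "b"], "score": [1.0, 2.0]})
subset = df[df["score"] > 1.0]
subset["score"] = 0.0  # clean write to an independent copy, no SettingWithCopyWarning

print(df["score"].tolist())      # [1.0, 2.0] -- original untouched
print(subset["score"].tolist())  # [0.0]
```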
src/utils.py CHANGED
@@ -120,7 +120,7 @@ def select_columns(
     filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
     filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     filtered_df.reset_index(inplace=True, drop=True)
-    filtered_df[COL_NAME_RANK] = filtered_df[COL_NAME_AVG].rank(ascending=False, method="min")
+    filtered_df = reset_rank(filtered_df)
 
     return filtered_df
 
@@ -312,5 +312,6 @@ def submit_results(
     )
 
 
-def clear_reranking_selections():
-    return ["NoReranker",]
+def reset_rank(df):
+    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
+    return df
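`reset_rank` centralizes the rank computation that `select_columns` previously inlined, so the retriever-only views can renumber ranks after filtering out reranked runs. With `method="min"`, tied averages share the best (smallest) rank and the following rank is skipped:

```python
import pandas as pd

avg = pd.Series([55.2, 55.2, 48.1])
print(avg.rank(ascending=False, method="min").tolist())
# [1.0, 1.0, 3.0] -- the two tied models both rank 1; the next model ranks 3
```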