Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
cbde346
1 Parent(s): c1df819

feat: add tabs for noreranker

Browse files
app.py CHANGED
@@ -11,11 +11,11 @@ from src.about import (
11
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
12
  DEFAULT_METRIC
13
  from src.display.css_html_js import custom_css
14
- from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
15
  from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
16
  from src.read_evals import get_raw_eval_results, get_leaderboard_df
17
  from src.utils import update_metric, upload_file, get_default_cols, submit_results
18
- from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table
19
  from src.display.gradio_listener import set_listeners
20
 
21
  def restart_space():
@@ -82,6 +82,13 @@ def update_metric_long_doc(
82
 
83
 
84
  demo = gr.Blocks(css=custom_css)
 
 
 
 
 
 
 
85
  with demo:
86
  gr.HTML(TITLE)
87
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -89,64 +96,101 @@ with demo:
89
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
90
  with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
91
  with gr.Row():
92
- with gr.Column():
93
- # search retrieval models
94
- with gr.Row():
95
- selected_version = get_version_dropdown()
96
- with gr.Row():
97
- search_bar = get_search_bar()
98
- with gr.Row():
99
- selected_rerankings = get_reranking_dropdown(reranking_models)
100
- with gr.Row():
101
- select_noreranker_only_btn = get_noreranker_button()
102
-
103
  with gr.Column(min_width=320):
104
- # select the metric
105
- selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
106
  # select domain
107
  with gr.Row():
108
  selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
109
  # select language
110
  with gr.Row():
111
  selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
 
 
 
 
 
 
112
  with gr.Row():
113
  show_anonymous = get_anonymous_checkbox()
114
  with gr.Row():
115
  show_revision_and_timestamp = get_revision_and_ts_checkbox()
116
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
 
 
 
 
 
 
 
 
 
 
119
 
120
- # Dummy leaderboard for handling the case when the user uses backspace key
121
- hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- set_listeners(
124
- "qa",
125
- leaderboard_table,
126
- hidden_leaderboard_table_for_search,
127
- search_bar,
128
- select_noreranker_only_btn,
129
- selected_domains,
130
- selected_langs,
131
- selected_rerankings,
132
- show_anonymous,
133
- show_revision_and_timestamp,
134
- )
135
 
136
- # set metric listener
137
- selected_metric.change(
138
- update_metric_qa,
139
- [
140
- selected_metric,
141
- selected_domains,
142
- selected_langs,
143
- selected_rerankings,
144
- search_bar,
145
- show_anonymous,
146
- ],
147
- leaderboard_table,
148
- queue=True
149
- )
150
 
151
  with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
152
  with gr.Row():
@@ -191,7 +235,6 @@ with demo:
191
  leaderboard_table,
192
  hidden_leaderboard_table_for_search,
193
  search_bar,
194
- select_noreranker_only_btn,
195
  selected_domains,
196
  selected_langs,
197
  selected_rerankings,
 
11
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
12
  DEFAULT_METRIC
13
  from src.display.css_html_js import custom_css
14
+ from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_AVG
15
  from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
16
  from src.read_evals import get_raw_eval_results, get_leaderboard_df
17
  from src.utils import update_metric, upload_file, get_default_cols, submit_results
18
+ from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table, get_noreranking_dropdown
19
  from src.display.gradio_listener import set_listeners
20
 
21
  def restart_space():
 
82
 
83
 
84
  demo = gr.Blocks(css=custom_css)
85
+
86
+
87
+ def reset_rank(df):
88
+ df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
89
+ return df
90
+
91
+
92
  with demo:
93
  gr.HTML(TITLE)
94
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
96
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
97
  with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
98
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
99
  with gr.Column(min_width=320):
 
 
100
  # select domain
101
  with gr.Row():
102
  selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
103
  # select language
104
  with gr.Row():
105
  selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
106
+
107
+ with gr.Column():
108
+ with gr.Row():
109
+ selected_version = get_version_dropdown()
110
+ # select the metric
111
+ selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
112
  with gr.Row():
113
  show_anonymous = get_anonymous_checkbox()
114
  with gr.Row():
115
  show_revision_and_timestamp = get_revision_and_ts_checkbox()
116
 
117
+ with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
118
+ with gr.TabItem("Retriever + Reranker", id=10):
119
+ with gr.Row():
120
+ # search retrieval models
121
+ with gr.Column():
122
+ search_bar = get_search_bar()
123
+ # select reranking models
124
+ with gr.Column():
125
+ selected_rerankings = get_reranking_dropdown(reranking_models)
126
+ leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
127
+ # Dummy leaderboard for handling the case when the user uses backspace key
128
+ hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
129
 
130
+ set_listeners(
131
+ "qa",
132
+ leaderboard_table,
133
+ hidden_leaderboard_table_for_search,
134
+ search_bar,
135
+ selected_domains,
136
+ selected_langs,
137
+ selected_rerankings,
138
+ show_anonymous,
139
+ show_revision_and_timestamp,
140
+ )
141
 
142
+ # set metric listener
143
+ selected_metric.change(
144
+ update_metric_qa,
145
+ [
146
+ selected_metric,
147
+ selected_domains,
148
+ selected_langs,
149
+ selected_rerankings,
150
+ search_bar,
151
+ show_anonymous,
152
+ ],
153
+ leaderboard_table,
154
+ queue=True
155
+ )
156
+ with gr.TabItem("Retriever Only", id=11):
157
+ with gr.Column():
158
+ search_bar_retriever = get_search_bar()
159
+ selected_noreranker = get_noreranking_dropdown()
160
+ lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
161
+ lb_df_retriever = reset_rank(lb_df_retriever)
162
+ hidden_lb_db_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
163
+ hidden_lb_db_retriever = reset_rank(hidden_lb_db_retriever)
164
+ lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
165
+ # Dummy leaderboard for handling the case when the user uses backspace key
166
+ hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_db_retriever, types_qa, visible=False)
167
 
168
+ set_listeners(
169
+ "qa",
170
+ lb_table_retriever,
171
+ hidden_lb_table_retriever,
172
+ search_bar_retriever,
173
+ selected_domains,
174
+ selected_langs,
175
+ selected_noreranker,
176
+ show_anonymous,
177
+ show_revision_and_timestamp,
178
+ )
 
179
 
180
+ # set metric listener
181
+ selected_metric.change(
182
+ update_metric_qa,
183
+ [
184
+ selected_metric,
185
+ selected_domains,
186
+ selected_langs,
187
+ selected_noreranker,
188
+ search_bar_retriever,
189
+ show_anonymous,
190
+ ],
191
+ lb_table_retriever,
192
+ queue=True
193
+ )
194
 
195
  with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
196
  with gr.Row():
 
235
  leaderboard_table,
236
  hidden_leaderboard_table_for_search,
237
  search_bar,
 
238
  selected_domains,
239
  selected_langs,
240
  selected_rerankings,
src/display/formatting.py CHANGED
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
4
 
5
  def make_clickable_model(model_name: str, model_link: str):
6
  # link = f"https://huggingface.co/{model_name}"
7
- if not model_link or not model_link.startswith("https://"):
8
  return model_name
9
  return model_hyperlink(model_link, model_name)
10
 
 
4
 
5
  def make_clickable_model(model_name: str, model_link: str):
6
  # link = f"https://huggingface.co/{model_name}"
7
+ if not model_link or not model_link.startswith("https://") or model_name == "BM25":
8
  return model_name
9
  return model_hyperlink(model_link, model_name)
10
 
src/display/gradio_formatting.py CHANGED
@@ -28,7 +28,14 @@ def get_reranking_dropdown(model_list):
28
  multiselect=True
29
  )
30
 
31
-
 
 
 
 
 
 
 
32
  def get_noreranker_button():
33
  return gr.Button(
34
  value="Only show results without ranking models",
 
28
  multiselect=True
29
  )
30
 
31
+ def get_noreranking_dropdown():
32
+ return gr.Dropdown(
33
+ choices=["NoReranker",],
34
+ value=["NoReranker",],
35
+ interactive=False,
36
+ multiselect=True,
37
+ visible=False
38
+ )
39
  def get_noreranker_button():
40
  return gr.Button(
41
  value="Only show results without ranking models",
src/display/gradio_listener.py CHANGED
@@ -6,7 +6,6 @@ def set_listeners(
6
  displayed_leaderboard,
7
  hidden_leaderboard,
8
  search_bar,
9
- select_noreranker_only_btn,
10
  selected_domains,
11
  selected_langs,
12
  selected_rerankings,
@@ -52,9 +51,3 @@ def set_listeners(
52
  displayed_leaderboard,
53
  queue=True,
54
  )
55
-
56
-
57
- select_noreranker_only_btn.click(
58
- clear_reranking_selections,
59
- outputs=selected_rerankings
60
- )
 
6
  displayed_leaderboard,
7
  hidden_leaderboard,
8
  search_bar,
 
9
  selected_domains,
10
  selected_langs,
11
  selected_rerankings,
 
51
  displayed_leaderboard,
52
  queue=True,
53
  )
 
 
 
 
 
 
src/read_evals.py CHANGED
@@ -25,6 +25,7 @@ from src.display.utils import (
25
 
26
  from src.display.formatting import make_clickable_model
27
 
 
28
 
29
  def calculate_mean(row):
30
  if pd.isna(row).any():
 
25
 
26
  from src.display.formatting import make_clickable_model
27
 
28
+ pd.options.mode.copy_on_write = True
29
 
30
  def calculate_mean(row):
31
  if pd.isna(row).any():