feat: add tabs for noreranker
Files changed:
- app.py (+88 -45)
- src/display/formatting.py (+1 -1)
- src/display/gradio_formatting.py (+8 -1)
- src/display/gradio_listener.py (+0 -7)
- src/read_evals.py (+1 -0)
app.py
CHANGED
@@ -11,11 +11,11 @@ from src.about import (
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
     DEFAULT_METRIC
 from src.display.css_html_js import custom_css
-from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP
+from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_AVG
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from src.read_evals import get_raw_eval_results, get_leaderboard_df
 from src.utils import update_metric, upload_file, get_default_cols, submit_results
-from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table
+from src.display.gradio_formatting import get_version_dropdown, get_search_bar, get_reranking_dropdown, get_noreranker_button, get_metric_dropdown, get_domain_dropdown, get_language_dropdown, get_anonymous_checkbox, get_revision_and_ts_checkbox, get_leaderboard_table, get_noreranking_dropdown
 from src.display.gradio_listener import set_listeners
 
 def restart_space():
@@ -82,6 +82,13 @@ def update_metric_long_doc(
 
 
 demo = gr.Blocks(css=custom_css)
+
+
+def reset_rank(df):
+    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
+    return df
+
+
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -89,64 +96,101 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
             with gr.Row():
-                with gr.Column():
-                    # search retrieval models
-                    with gr.Row():
-                        selected_version = get_version_dropdown()
-                    with gr.Row():
-                        search_bar = get_search_bar()
-                    with gr.Row():
-                        selected_rerankings = get_reranking_dropdown(reranking_models)
-                    with gr.Row():
-                        select_noreranker_only_btn = get_noreranker_button()
-
                 with gr.Column(min_width=320):
-                    # select the metric
-                    selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                     # select domain
                     with gr.Row():
                         selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
                     # select language
                     with gr.Row():
                         selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
+
+                with gr.Column():
+                    with gr.Row():
+                        selected_version = get_version_dropdown()
+                    # select the metric
+                    selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC)
                 with gr.Row():
                     show_anonymous = get_anonymous_checkbox()
                 with gr.Row():
                     show_revision_and_timestamp = get_revision_and_ts_checkbox()
 
+            with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+                with gr.TabItem("Retriever + Reranker", id=10):
+                    with gr.Row():
+                        # search retrieval models
+                        with gr.Column():
+                            search_bar = get_search_bar()
+                        # select reranking models
+                        with gr.Column():
+                            selected_rerankings = get_reranking_dropdown(reranking_models)
+                    leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
+
+                    set_listeners(
+                        "qa",
+                        leaderboard_table,
+                        hidden_leaderboard_table_for_search,
+                        search_bar,
+                        selected_domains,
+                        selected_langs,
+                        selected_rerankings,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_qa,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_rerankings,
+                            search_bar,
+                            show_anonymous,
+                        ],
+                        leaderboard_table,
+                        queue=True
+                    )
+                with gr.TabItem("Retriever Only", id=11):
+                    with gr.Column():
+                        search_bar_retriever = get_search_bar()
+                        selected_noreranker = get_noreranking_dropdown()
+                    lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                    lb_df_retriever = reset_rank(lb_df_retriever)
+                    hidden_lb_db_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                    hidden_lb_db_retriever = reset_rank(hidden_lb_db_retriever)
+                    lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+                    # Dummy leaderboard for handling the case when the user uses backspace key
+                    hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_db_retriever, types_qa, visible=False)
+
+                    set_listeners(
+                        "qa",
+                        lb_table_retriever,
+                        hidden_lb_table_retriever,
+                        search_bar_retriever,
+                        selected_domains,
+                        selected_langs,
+                        selected_noreranker,
+                        show_anonymous,
+                        show_revision_and_timestamp,
+                    )
+
+                    # set metric listener
+                    selected_metric.change(
+                        update_metric_qa,
+                        [
+                            selected_metric,
+                            selected_domains,
+                            selected_langs,
+                            selected_noreranker,
+                            search_bar_retriever,
+                            show_anonymous,
+                        ],
+                        lb_table_retriever,
+                        queue=True
+                    )
-            [old lines 117-150 removed: the previous single-view QA leaderboard table, set_listeners call, and metric listener; their contents are not rendered in this diff view]
 
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
@@ -191,7 +235,6 @@ with demo:
                 leaderboard_table,
                 hidden_leaderboard_table_for_search,
                 search_bar,
-                select_noreranker_only_btn,
                 selected_domains,
                 selected_langs,
                 selected_rerankings,
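A note on the new reset_rank helper: the "Retriever Only" sub-tab filters the QA leaderboard down to rows whose reranking model is "NoReranker", which leaves gaps in the precomputed rank column, so reset_rank recomputes a dense ranking from the average score. A minimal, self-contained sketch of the same pandas call; the column names below are stand-ins for the COL_NAME_RANK and COL_NAME_AVG constants imported from src.display.utils:

import pandas as pd

COL_NAME_RANK, COL_NAME_AVG = "Rank", "Average"  # stand-ins for the real constants

def reset_rank(df: pd.DataFrame) -> pd.DataFrame:
    # Highest average gets rank 1; ties share the smallest rank ("min" method).
    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
    return df

filtered = pd.DataFrame({COL_NAME_AVG: [0.71, 0.65, 0.71]})
print(reset_rank(filtered)[COL_NAME_RANK].tolist())  # [1.0, 3.0, 1.0]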
src/display/formatting.py
CHANGED
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
 
 def make_clickable_model(model_name: str, model_link: str):
     # link = f"https://huggingface.co/{model_name}"
-    if not model_link or not model_link.startswith("https://"):
+    if not model_link or not model_link.startswith("https://") or model_name == "BM25":
        return model_name
    return model_hyperlink(model_link, model_name)
 
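The extra model_name == "BM25" guard keeps BM25 rows as plain text even when a link is available. A quick sketch of the resulting behaviour; the body of model_hyperlink is assumed here (an HTML anchor for the table cell), not copied from the repo:

def model_hyperlink(link, model_name):
    # assumed shape of the existing helper: wrap the name in an HTML anchor
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def make_clickable_model(model_name: str, model_link: str):
    if not model_link or not model_link.startswith("https://") or model_name == "BM25":
        return model_name
    return model_hyperlink(model_link, model_name)

print(make_clickable_model("BM25", "https://example.com"))  # "BM25", rendered as plain text
print(make_clickable_model("bge-m3", "https://huggingface.co/BAAI/bge-m3"))  # HTML anchor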
src/display/gradio_formatting.py
CHANGED
@@ -28,7 +28,14 @@ def get_reranking_dropdown(model_list):
         multiselect=True
     )
 
-
+def get_noreranking_dropdown():
+    return gr.Dropdown(
+        choices=["NoReranker",],
+        value=["NoReranker",],
+        interactive=False,
+        multiselect=True,
+        visible=False
+    )
 def get_noreranker_button():
     return gr.Button(
         value="Only show results without ranking models",
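The new dropdown is hidden and locked to the single value "NoReranker": it exists so the "Retriever Only" tab can feed a constant reranking filter into the same set_listeners and update_metric plumbing used by the "Retriever + Reranker" tab. A standalone sketch of how such a hidden component behaves as a regular Gradio input; the Blocks/Button wiring below is illustrative, not taken from the app:

import gradio as gr

def get_noreranking_dropdown():
    # Invisible, non-interactive dropdown pinned to ["NoReranker"], so downstream
    # callbacks always receive exactly that reranking model.
    return gr.Dropdown(
        choices=["NoReranker"],
        value=["NoReranker"],
        interactive=False,
        multiselect=True,
        visible=False,
    )

with gr.Blocks() as demo:
    selected_noreranker = get_noreranking_dropdown()
    out = gr.Textbox(label="Active reranking filter")
    btn = gr.Button("Show filter")
    # The hidden dropdown is passed as an ordinary input component.
    btn.click(lambda models: ", ".join(models), inputs=selected_noreranker, outputs=out)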
src/display/gradio_listener.py
CHANGED
@@ -6,7 +6,6 @@ def set_listeners(
     displayed_leaderboard,
     hidden_leaderboard,
     search_bar,
-    select_noreranker_only_btn,
     selected_domains,
     selected_langs,
     selected_rerankings,
@@ -52,9 +51,3 @@ def set_listeners(
         displayed_leaderboard,
         queue=True,
     )
-
-
-    select_noreranker_only_btn.click(
-        clear_reranking_selections,
-        outputs=selected_rerankings
-    )
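With the "Only show results without ranking models" button gone, its click listener is removed here as well; filtering to NoReranker now happens through the dedicated sub-tab instead of clearing the reranking selections. For orientation, a heavily hedged sketch of the wiring pattern the remaining body of set_listeners follows (the update_table callback and the exact input list are hypothetical stand-ins, not the repo's code):

def update_table(hidden_df, query, domains, langs, rerankers, anonymous, show_meta):
    # hypothetical stand-in: the real callback filters the hidden, unfiltered table
    # by the current control values and returns the frame to display
    return hidden_df

def set_listeners_sketch(displayed_leaderboard, hidden_leaderboard, search_bar,
                         selected_domains, selected_langs, selected_rerankings,
                         show_anonymous, show_revision_and_timestamp):
    inputs = [hidden_leaderboard, search_bar, selected_domains, selected_langs,
              selected_rerankings, show_anonymous, show_revision_and_timestamp]
    # every control change re-renders the visible table from the hidden one
    for control in (selected_domains, selected_langs, selected_rerankings,
                    show_anonymous, show_revision_and_timestamp):
        control.change(update_table, inputs, displayed_leaderboard, queue=True)
    search_bar.submit(update_table, inputs, displayed_leaderboard, queue=True)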
src/read_evals.py
CHANGED
@@ -25,6 +25,7 @@ from src.display.utils import (
 
 from src.display.formatting import make_clickable_model
 
+pd.options.mode.copy_on_write = True
 
 def calculate_mean(row):
     if pd.isna(row).any():