feat: add versioning for the qa retrieval
- app.py +17 -8
- src/envs.py +1 -1
- src/utils.py +0 -6
app.py
CHANGED
@@ -14,7 +14,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL
+    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, BM25_LINK
 )
 from src.loaders import (
     load_eval_results
@@ -22,7 +22,8 @@ from src.loaders import (
 from src.utils import (
     update_metric,
     set_listeners,
-    reset_rank
+    reset_rank,
+    remove_html
 )
 from src.display.gradio_formatting import (
     get_version_dropdown,
@@ -183,6 +184,7 @@ with demo:
         lb_df_retriever = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
         lb_df_retriever = reset_rank(lb_df_retriever)
         lb_table_retriever = get_leaderboard_table(lb_df_retriever, datastore.types_qa)
+
         # Dummy leaderboard for handling the case when the user uses backspace key
         hidden_lb_df_retriever = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
         hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
@@ -222,9 +224,8 @@ with demo:
             lb_table_retriever,
             queue=True
         )
-        """
         with gr.TabItem("Reranking Only", id=12):
-            lb_df_reranker =
+            lb_df_reranker = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             lb_df_reranker = reset_rank(lb_df_reranker)
             reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
             with gr.Row():
@@ -232,11 +233,18 @@ with demo:
                     selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
                 with gr.Column(scale=1):
                     search_bar_reranker = gr.Textbox(show_label=False, visible=False)
-            lb_table_reranker = get_leaderboard_table(lb_df_reranker,
-
+            lb_table_reranker = get_leaderboard_table(lb_df_reranker, datastore.types_qa)
+
+            hidden_lb_df_reranker = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
             hidden_lb_table_reranker = get_leaderboard_table(
-                hidden_lb_df_reranker,
+                hidden_lb_df_reranker, datastore.types_qa, visible=False
+            )
+
+            selected_version.change(
+                update_datastore,
+                [selected_version,],
+                [selected_domains, selected_langs, selected_rerankings_reranker, lb_table_reranker, hidden_lb_table_reranker]
             )

             set_listeners(
@@ -244,6 +252,7 @@ with demo:
                 lb_table_reranker,
                 hidden_lb_table_reranker,
                 search_bar_reranker,
+                selected_version,
                 selected_domains,
                 selected_langs,
                 selected_rerankings_reranker,
@@ -261,11 +270,11 @@ with demo:
                 search_bar_reranker,
                 show_anonymous,
                 show_revision_and_timestamp,
-                selected_version,
             ],
             lb_table_reranker,
             queue=True
         )
+        """
     with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
         with gr.Row():
             with gr.Column(min_width=320):
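The new selected_version.change(...) wiring is the heart of this commit: when the version dropdown changes, update_datastore recomputes the filter widgets and both leaderboard tables. Below is a minimal, self-contained sketch of the same Gradio event pattern, assuming a toy per-version table; the update_datastore here is a stand-in for the app's real helper of the same name, and the version strings are illustrative.

# Sketch of the selected_version.change(...) pattern, with assumed data.
import gradio as gr
import pandas as pd

VERSIONS = ["AIR-Bench_24.04", "AIR-Bench_24.05"]  # illustrative values
TABLES = {v: pd.DataFrame({"Model": ["bm25"], "Version": [v]}) for v in VERSIONS}

def update_datastore(version):
    # Return one value per output component, in the order they appear
    # in the outputs list passed to .change(...).
    return TABLES[version]

with gr.Blocks() as demo:
    selected_version = gr.Dropdown(choices=VERSIONS, value=VERSIONS[-1], label="Version")
    lb_table = gr.Dataframe(value=TABLES[VERSIONS[-1]])
    # Same wiring shape as the diff: fn, [inputs], [outputs].
    selected_version.change(update_datastore, [selected_version], [lb_table])

demo.launch()

In the actual app the outputs list is longer (domain and language selectors plus the visible and hidden tables), but the contract is identical: the callback returns one updated value per listed output component.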
src/envs.py
CHANGED
@@ -30,7 +30,7 @@ BENCHMARK_VERSION_LIST = [
     "AIR-Bench_24.05",
 ]

-LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[
+LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[0]
 DEFAULT_METRIC_QA = "ndcg_at_10"
 DEFAULT_METRIC_LONG_DOC = "recall_at_10"
 METRIC_LIST = [
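The fix pins LATEST_BENCHMARK_VERSION to the list's first entry (the removed line's index was cut off in the rendered diff, so the old value is not recoverable here). A hedged alternative, not what the repo does, is to derive the latest version rather than depend on list order; the "AIR-Bench_YY.MM" naming sorts correctly as a plain string while both fields stay zero-padded.

# Alternative sketch, with an assumed two-element list: pick the latest
# version by sorting instead of by position.
BENCHMARK_VERSION_LIST = ["AIR-Bench_24.04", "AIR-Bench_24.05"]  # illustrative
LATEST_BENCHMARK_VERSION = max(BENCHMARK_VERSION_LIST)
assert LATEST_BENCHMARK_VERSION == "AIR-Bench_24.05"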
src/utils.py
CHANGED
@@ -177,10 +177,6 @@ def _update_table(
|
|
177 |
show_revision_and_timestamp: bool = False
|
178 |
):
|
179 |
version_slug = get_safe_name(version)[-4:]
|
180 |
-
if isinstance(hidden_df, str):
|
181 |
-
print(f"task: {task}")
|
182 |
-
print(f"version: {version}")
|
183 |
-
print(f"hidden_df is a string: {hidden_df}")
|
184 |
filtered_df = hidden_df.copy()
|
185 |
if not show_anonymous:
|
186 |
filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
|
@@ -257,7 +253,6 @@ def upload_file(filepath: str):
|
|
257 |
return filepath
|
258 |
|
259 |
|
260 |
-
|
261 |
def get_iso_format_timestamp():
|
262 |
# Get the current timestamp with UTC as the timezone
|
263 |
current_timestamp = datetime.now(timezone.utc)
|
@@ -377,7 +372,6 @@ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
|
|
377 |
for v in raw_data:
|
378 |
all_data_json += v.to_dict(task=task, metric=metric)
|
379 |
df = pd.DataFrame.from_records(all_data_json)
|
380 |
-
# print(f'dataframe created: {df.shape}')
|
381 |
|
382 |
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
|
383 |
|
|
|
177 |
show_revision_and_timestamp: bool = False
|
178 |
):
|
179 |
version_slug = get_safe_name(version)[-4:]
|
|
|
|
|
|
|
|
|
180 |
filtered_df = hidden_df.copy()
|
181 |
if not show_anonymous:
|
182 |
filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
|
|
|
253 |
return filepath
|
254 |
|
255 |
|
|
|
256 |
def get_iso_format_timestamp():
|
257 |
# Get the current timestamp with UTC as the timezone
|
258 |
current_timestamp = datetime.now(timezone.utc)
|
|
|
372 |
for v in raw_data:
|
373 |
all_data_json += v.to_dict(task=task, metric=metric)
|
374 |
df = pd.DataFrame.from_records(all_data_json)
|
|
|
375 |
|
376 |
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
|
377 |
|
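The src/utils.py changes are pure cleanup: the stray print() diagnostics and one commented-out print are deleted. If that kind of tracing is still wanted, the usual pattern is to route it through the logging module so it can be silenced by configuration instead of by editing the function; a small sketch follows, where _debug_update_table is a hypothetical helper, not part of src/utils.py.

# Sketch: logging-based replacement for the deleted ad-hoc print() calls.
import logging

logger = logging.getLogger(__name__)

def _debug_update_table(task: str, version: str, hidden_df) -> None:
    # Hypothetical stand-in: the message is emitted only when the logger
    # is configured at DEBUG level, so it costs nothing in production.
    logger.debug("task=%s version=%s hidden_df=%r", task, version, hidden_df)

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    _debug_update_table("qa", "AIR-Bench_24.05", "unexpected string payload")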