Spaces: AIR-Bench

nan committed
Commit 23b3543
1 Parent(s): e050b39

feat: add versioning for the qa retrieval

Files changed (3):
  1. app.py +17 -8
  2. src/envs.py +1 -1
  3. src/utils.py +0 -6
app.py CHANGED
@@ -14,7 +14,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL
+    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, BM25_LINK
 )
 from src.loaders import (
     load_eval_results
@@ -22,7 +22,8 @@ from src.loaders import (
 from src.utils import (
     update_metric,
     set_listeners,
-    reset_rank
+    reset_rank,
+    remove_html
 )
 from src.display.gradio_formatting import (
     get_version_dropdown,
@@ -183,6 +184,7 @@ with demo:
                 lb_df_retriever = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                 lb_df_retriever = reset_rank(lb_df_retriever)
                 lb_table_retriever = get_leaderboard_table(lb_df_retriever, datastore.types_qa)
+
                 # Dummy leaderboard for handling the case when the user uses backspace key
                 hidden_lb_df_retriever = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                 hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
@@ -222,9 +224,8 @@ with demo:
                     lb_table_retriever,
                     queue=True
                 )
-            """
             with gr.TabItem("Reranking Only", id=12):
-                lb_df_reranker = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                lb_df_reranker = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 lb_df_reranker = reset_rank(lb_df_reranker)
                 reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                 with gr.Row():
@@ -232,11 +233,18 @@ with demo:
                         selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
                     with gr.Column(scale=1):
                         search_bar_reranker = gr.Textbox(show_label=False, visible=False)
-                lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
-                hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                lb_table_reranker = get_leaderboard_table(lb_df_reranker, datastore.types_qa)
+
+                hidden_lb_df_reranker = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
                 hidden_lb_table_reranker = get_leaderboard_table(
-                    hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
+                    hidden_lb_df_reranker, datastore.types_qa, visible=False
+                )
+
+                selected_version.change(
+                    update_datastore,
+                    [selected_version,],
+                    [selected_domains, selected_langs, selected_rerankings_reranker, lb_table_reranker, hidden_lb_table_reranker]
                 )
 
                 set_listeners(
@@ -244,6 +252,7 @@ with demo:
                     lb_table_reranker,
                     hidden_lb_table_reranker,
                    search_bar_reranker,
+                    selected_version,
                     selected_domains,
                     selected_langs,
                     selected_rerankings_reranker,
@@ -261,11 +270,11 @@ with demo:
                         search_bar_reranker,
                         show_anonymous,
                         show_revision_and_timestamp,
-                        selected_version,
                     ],
                     lb_table_reranker,
                     queue=True
                 )
+        """
         with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
             with gr.Row():
                 with gr.Column(min_width=320):
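Note on the new wiring: the added selected_version.change(...) call registers update_datastore to fire whenever the version dropdown changes, and lists the domain/language dropdowns and both tables as the outputs to refresh. The handler's definition is not part of this commit, so the snippet below is only a minimal, self-contained sketch of the Gradio change-listener shape the diff relies on; the version list, table contents, and handler body are placeholder assumptions, not the Space's actual code.

import gradio as gr
import pandas as pd

# Placeholder data standing in for the per-version datastores.
VERSIONS = ["AIR-Bench_24.04", "AIR-Bench_24.05"]
TABLES = {v: pd.DataFrame({"retrieval_model": ["bm25"], "version": [v]}) for v in VERSIONS}

def update_datastore(version):
    # A change handler returns one value per output component, in order;
    # the real handler would also return refreshed dropdown choices.
    return TABLES[version]

with gr.Blocks() as demo:
    selected_version = gr.Dropdown(choices=VERSIONS, value=VERSIONS[0], label="Version")
    lb_table = gr.Dataframe(value=TABLES[VERSIONS[0]])
    # Same shape as the call added in the diff: fn, inputs, outputs.
    selected_version.change(update_datastore, [selected_version], [lb_table])

demo.launch()

Consistent with this, the diff also threads selected_version into the set_listeners call and drops it from the input list of the final listener, routing version switches through the dedicated handler.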
src/envs.py CHANGED
@@ -30,7 +30,7 @@ BENCHMARK_VERSION_LIST = [
     "AIR-Bench_24.05",
 ]
 
-LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
+LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[0]
 DEFAULT_METRIC_QA = "ndcg_at_10"
 DEFAULT_METRIC_LONG_DOC = "recall_at_10"
 METRIC_LIST = [
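Note on the default version: this one-line change moves the default from the tail of BENCHMARK_VERSION_LIST to its head. Only "AIR-Bench_24.05" is visible in the hunk context, so the first entry below is an assumption about the list's earlier contents; the snippet just shows what the index change does.

# Assuming the list is ordered oldest-to-newest; only "AIR-Bench_24.05"
# appears in the diff context, so the first entry here is hypothetical.
BENCHMARK_VERSION_LIST = [
    "AIR-Bench_24.04",
    "AIR-Bench_24.05",
]

print(BENCHMARK_VERSION_LIST[-1])  # old default: AIR-Bench_24.05 (last entry)
print(BENCHMARK_VERSION_LIST[0])   # new default: AIR-Bench_24.04 (first entry)

If that ordering assumption holds, LATEST_BENCHMARK_VERSION now names the oldest entry rather than the newest, which is worth checking against the commit's intent.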
src/utils.py CHANGED
@@ -177,10 +177,6 @@ def _update_table(
     show_revision_and_timestamp: bool = False
 ):
     version_slug = get_safe_name(version)[-4:]
-    if isinstance(hidden_df, str):
-        print(f"task: {task}")
-        print(f"version: {version}")
-        print(f"hidden_df is a string: {hidden_df}")
     filtered_df = hidden_df.copy()
     if not show_anonymous:
         filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
@@ -257,7 +253,6 @@ def upload_file(filepath: str):
     return filepath
 
 
-
 def get_iso_format_timestamp():
     # Get the current timestamp with UTC as the timezone
     current_timestamp = datetime.now(timezone.utc)
@@ -377,7 +372,6 @@ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
     df = pd.DataFrame.from_records(all_data_json)
-    # print(f'dataframe created: {df.shape}')
 
     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
 
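Note on the surviving line in the last hunk: intersecting the expected benchmark columns with the columns actually present in the results frame means benchmarks with no submitted results are silently skipped instead of raising a KeyError downstream. A standalone illustration of the pattern, with made-up column names:

import pandas as pd

# Hypothetical benchmark columns; the real names come from the benchmark config.
benchmark_cols = ["wiki_en", "news_zh", "law_en"]

df = pd.DataFrame.from_records([
    {"retrieval_model": "bm25", "wiki_en": 0.71, "news_zh": 0.64},
])

# Same pattern as get_leaderboard_df: keep only the columns that exist in df.
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
print(sorted(_benchmark_cols))  # ['news_zh', 'wiki_en']; 'law_en' is absent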