Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
2edd122
·
1 Parent(s): a30a228

feat: add rank and language dropdown lists

Browse files
Files changed (4) hide show
  1. app.py +14 -10
  2. src/display/utils.py +4 -0
  3. src/leaderboard/read_evals.py +11 -2
  4. utils.py +23 -18
app.py CHANGED
@@ -12,7 +12,7 @@ from src.display.css_html_js import custom_css
12
  from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
13
 
14
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
15
- from utils import update_table, update_metric, update_table_long_doc, upload_file
16
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
17
 
18
 
@@ -39,7 +39,12 @@ print(f'QA data loaded: {original_df_qa.shape}')
39
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
40
 
41
  leaderboard_df_qa = original_df_qa.copy()
 
 
 
42
  leaderboard_df_long_doc = original_df_long_doc.copy()
 
 
43
 
44
 
45
  def update_metric_qa(
@@ -97,11 +102,12 @@ with demo:
97
  )
98
  # select language
99
  with gr.Row():
100
- selected_langs = gr.CheckboxGroup(
101
  choices=LANG_COLS_QA,
102
  value=LANG_COLS_QA,
103
  label="Select the languages",
104
  elem_id="language-column-select",
 
105
  interactive=True
106
  )
107
  # select reranking model
@@ -117,8 +123,6 @@ with demo:
117
 
118
  leaderboard_table = gr.components.Dataframe(
119
  value=leaderboard_df_qa,
120
- # headers=shown_columns,
121
- # datatype=TYPES,
122
  elem_id="leaderboard-table",
123
  interactive=False,
124
  visible=True,
@@ -205,11 +209,12 @@ with demo:
205
  )
206
  # select language
207
  with gr.Row():
208
- selected_langs = gr.CheckboxGroup(
209
  choices=LANG_COLS_LONG_DOC,
210
  value=LANG_COLS_LONG_DOC,
211
  label="Select the languages",
212
  elem_id="language-column-select-long-doc",
 
213
  interactive=True
214
  )
215
  # select reranking model
@@ -225,8 +230,6 @@ with demo:
225
 
226
  leaderboard_table_long_doc = gr.components.Dataframe(
227
  value=leaderboard_df_long_doc,
228
- # headers=shown_columns,
229
- # datatype=TYPES,
230
  elem_id="leaderboard-table-long-doc",
231
  interactive=False,
232
  visible=True,
@@ -235,8 +238,6 @@ with demo:
235
  # Dummy leaderboard for handling the case when the user uses backspace key
236
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
237
  value=leaderboard_df_long_doc,
238
- # headers=COLS,
239
- # datatype=TYPES,
240
  visible=False,
241
  )
242
 
@@ -293,7 +294,10 @@ with demo:
293
  with gr.Row():
294
  with gr.Column():
295
  benchmark_version = gr.Dropdown(
296
- ['AIR-Bench_24.04',], value=['AIR-Bench_24.04',], interactive=True, label="AIR-Bench Version")
 
 
 
297
  with gr.Column():
298
  model_name_textbox = gr.Textbox(label="Model name")
299
  with gr.Column():
 
12
  from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
13
 
14
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
15
+ from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols
16
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
17
 
18
 
 
39
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
40
 
41
  leaderboard_df_qa = original_df_qa.copy()
42
+ shown_columns_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_cols=True)
43
+ leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
44
+
45
  leaderboard_df_long_doc = original_df_long_doc.copy()
46
+ shown_columns_long_doc = get_default_cols('long_doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
47
+ leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
48
 
49
 
50
  def update_metric_qa(
 
102
  )
103
  # select language
104
  with gr.Row():
105
+ selected_langs = gr.Dropdown(
106
  choices=LANG_COLS_QA,
107
  value=LANG_COLS_QA,
108
  label="Select the languages",
109
  elem_id="language-column-select",
110
+ multiselect=True,
111
  interactive=True
112
  )
113
  # select reranking model
 
123
 
124
  leaderboard_table = gr.components.Dataframe(
125
  value=leaderboard_df_qa,
 
 
126
  elem_id="leaderboard-table",
127
  interactive=False,
128
  visible=True,
 
209
  )
210
  # select language
211
  with gr.Row():
212
+ selected_langs = gr.Dropdown(
213
  choices=LANG_COLS_LONG_DOC,
214
  value=LANG_COLS_LONG_DOC,
215
  label="Select the languages",
216
  elem_id="language-column-select-long-doc",
217
+ multiselect=True,
218
  interactive=True
219
  )
220
  # select reranking model
 
230
 
231
  leaderboard_table_long_doc = gr.components.Dataframe(
232
  value=leaderboard_df_long_doc,
 
 
233
  elem_id="leaderboard-table-long-doc",
234
  interactive=False,
235
  visible=True,
 
238
  # Dummy leaderboard for handling the case when the user uses backspace key
239
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
240
  value=leaderboard_df_long_doc,
 
 
241
  visible=False,
242
  )
243
 
 
294
  with gr.Row():
295
  with gr.Column():
296
  benchmark_version = gr.Dropdown(
297
+ ["AIR-Bench_24.04",],
298
+ value="AIR-Bench_24.04",
299
+ interactive=True,
300
+ label="AIR-Bench Version")
301
  with gr.Column():
302
  model_name_textbox = gr.Textbox(label="Model name")
303
  with gr.Column():
src/display/utils.py CHANGED
@@ -22,6 +22,7 @@ class ColumnContent:
22
  COL_NAME_AVG = "Average ⬆️"
23
  COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
24
  COL_NAME_RERANKING_MODEL = "Reranking Model"
 
25
 
26
  def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
27
  ## Leaderboard columns
@@ -36,6 +37,9 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
36
  auto_eval_column_dict.append(
37
  ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
38
  )
 
 
 
39
  for benchmark in benchmarks:
40
  auto_eval_column_dict.append(
41
  [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
 
22
  COL_NAME_AVG = "Average ⬆️"
23
  COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
24
  COL_NAME_RERANKING_MODEL = "Reranking Model"
25
+ COL_NAME_RANK = "Rank 🏆"
26
 
27
  def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
28
  ## Leaderboard columns
 
37
  auto_eval_column_dict.append(
38
  ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
39
  )
40
+ auto_eval_column_dict.append(
41
+ ["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)]
42
+ )
43
  for benchmark in benchmarks:
44
  auto_eval_column_dict.append(
45
  [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
src/leaderboard/read_evals.py CHANGED
@@ -9,8 +9,16 @@ import pandas as pd
9
 
10
  from src.benchmarks import get_safe_name
11
  from src.display.formatting import has_no_nan_values
12
- from src.display.utils import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COLS_QA, QA_BENCHMARK_COLS, \
13
- COLS_LONG_DOC, LONG_DOC_BENCHMARK_COLS, COL_NAME_AVG
 
 
 
 
 
 
 
 
14
 
15
 
16
  @dataclass
@@ -158,6 +166,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
158
  df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
159
  df = df.sort_values(by=[COL_NAME_AVG], ascending=False)
160
  df.reset_index(inplace=True)
 
161
 
162
  _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
163
  df = df[_cols].round(decimals=2)
 
9
 
10
  from src.benchmarks import get_safe_name
11
  from src.display.formatting import has_no_nan_values
12
+ from src.display.utils import (
13
+ COL_NAME_RERANKING_MODEL,
14
+ COL_NAME_RETRIEVAL_MODEL,
15
+ COLS_QA,
16
+ QA_BENCHMARK_COLS,
17
+ COLS_LONG_DOC,
18
+ LONG_DOC_BENCHMARK_COLS,
19
+ COL_NAME_AVG,
20
+ COL_NAME_RANK
21
+ )
22
 
23
 
24
  @dataclass
 
166
  df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
167
  df = df.sort_values(by=[COL_NAME_AVG], ascending=False)
168
  df.reset_index(inplace=True)
169
+ df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="dense")
170
 
171
  _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
172
  df = df[_cols].round(decimals=2)
utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import List
3
  import pandas as pd
4
 
5
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
6
- from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC
7
  from src.leaderboard.read_evals import FullEvalResult, get_leaderboard_df
8
 
9
 
@@ -37,25 +37,28 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
37
  return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
38
 
39
 
40
- def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, task: str = "qa") -> pd.DataFrame:
41
  if task == "qa":
42
- always_here_cols = [
43
- AutoEvalColumnQA.retrieval_model.name,
44
- AutoEvalColumnQA.reranking_model.name,
45
- AutoEvalColumnQA.average.name
46
- ]
47
- cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)))
48
  elif task == "long_doc":
49
- always_here_cols = [
50
- AutoEvalColumnLongDoc.retrieval_model.name,
51
- AutoEvalColumnLongDoc.reranking_model.name,
52
- AutoEvalColumnLongDoc.average.name
53
- ]
54
- cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)))
 
 
 
 
 
 
 
 
 
 
55
  selected_cols = []
56
  for c in cols:
57
- if c not in df.columns:
58
- continue
59
  if task == "qa":
60
  eval_col = BenchmarksQA[c].value
61
  elif task == "long_doc":
@@ -66,8 +69,10 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, t
66
  continue
67
  selected_cols.append(c)
68
  # We use COLS to maintain sorting
69
- filtered_df = df[always_here_cols + selected_cols]
70
- filtered_df[always_here_cols[2]] = filtered_df[selected_cols].mean(axis=1).round(decimals=2)
 
 
71
  return filtered_df
72
 
73
 
 
3
  import pandas as pd
4
 
5
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
6
+ from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
7
  from src.leaderboard.read_evals import FullEvalResult, get_leaderboard_df
8
 
9
 
 
37
  return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
38
 
39
 
40
def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
    """Return the benchmark columns to display for ``task``.

    A column is kept when it appears in the task's full column list
    (``COLS_QA`` / ``COLS_LONG_DOC``), in the task's benchmark column list
    (``BENCHMARK_COLS_QA`` / ``BENCHMARK_COLS_LONG_DOC``) and in ``columns``.
    The result follows the order of the task's full column list, so the
    displayed column order is stable from one run to the next.

    Args:
        task: Either ``"qa"`` or ``"long_doc"``.
        columns: Column labels actually available (e.g. ``df.columns``).
        add_fix_cols: When true, ``FIXED_COLS`` is prepended to the result.

    Returns:
        A new list of column names.

    Raises:
        NotImplementedError: If ``task`` is not a supported task name.
    """
    if task == "qa":
        all_cols, benchmark_cols = COLS_QA, BENCHMARK_COLS_QA
    elif task == "long_doc":
        all_cols, benchmark_cols = COLS_LONG_DOC, BENCHMARK_COLS_LONG_DOC
    else:
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # would surface as a confusing TypeError instead of the intended error.
        raise NotImplementedError(f"Unsupported task: {task!r}")

    # Set membership for the filter, but iterate the ordered column list so the
    # output order is deterministic (a bare set intersection has no order).
    wanted = frozenset(benchmark_cols).intersection(columns)
    # dict.fromkeys drops duplicate labels while keeping first-seen order.
    cols = [c for c in dict.fromkeys(all_cols) if c in wanted]

    if add_fix_cols:
        cols = FIXED_COLS + cols
    return cols
50
+
51
+ FIXED_COLS = [
52
+ COL_NAME_RANK,
53
+ COL_NAME_RETRIEVAL_MODEL,
54
+ COL_NAME_RERANKING_MODEL,
55
+ COL_NAME_AVG,
56
+ ]
57
+
58
+ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, task: str = "qa") -> pd.DataFrame:
59
+ cols = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
60
  selected_cols = []
61
  for c in cols:
 
 
62
  if task == "qa":
63
  eval_col = BenchmarksQA[c].value
64
  elif task == "long_doc":
 
69
  continue
70
  selected_cols.append(c)
71
  # We use COLS to maintain sorting
72
+ filtered_df = df[FIXED_COLS + selected_cols]
73
+ filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].mean(axis=1).round(decimals=2)
74
+ filtered_df[COL_NAME_RANK] = filtered_df[COL_NAME_AVG].rank(ascending=False, method="dense")
75
+
76
  return filtered_df
77
 
78