feat: adapt the utils in app.py

Files changed:
- app.py +13 -89
- src/benchmarks.py +10 -5
- src/populate.py +5 -4
- tests/src/test_populate.py +7 -4
- tests/test_utils.py +53 -0
- tests/toydata/test_results/bge-m3/NoReranker/results_demo_2023-12-21T18-10-08.json +1 -1
- utils.py +70 -0
app.py
CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
@@ -10,18 +9,15 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-
+    QA_BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    NUMERIC_INTERVALS,
     TYPES,
-
-
-    fields,
-    Precision
+    AutoEvalColumnQA,
+    fields
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import
+from src.populate import get_leaderboard_df
+from utils import update_table
 
 
 def restart_space():
@@ -45,9 +41,9 @@ try:
 except Exception:
     restart_space()
 
-
-    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
-leaderboard_df =
+raw_data_qa, original_df_qa = get_leaderboard_df(
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_1')
+leaderboard_df = original_df_qa.copy()
 
 # (
 # finished_eval_queue_df,
@@ -56,78 +52,6 @@ leaderboard_df = original_df.copy()
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-# Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
-):
-    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-    filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
-    return df
-
-
-def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
-
-
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
-        AutoEvalColumn.model.name,
-    ]
-    # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
-    ]
-    return filtered_df
-
-
-def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-    final_df = []
-    if query != "":
-        queries = [q.strip() for q in query.split(";")]
-        for _q in queries:
-            _q = _q.strip()
-            if _q != "":
-                temp_filtered_df = search_table(filtered_df, _q)
-                if len(temp_filtered_df) > 0:
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-            )
-
-    return filtered_df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    if show_deleted:
-        filtered_df = df
-    else:  # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -147,12 +71,12 @@ with demo:
             shown_columns = gr.CheckboxGroup(
                 choices=[
                     c.name
-                    for c in fields(
+                    for c in fields(AutoEvalColumnQA)
                     if not c.hidden and not c.never_hidden
                 ],
                 value=[
                     c.name
-                    for c in fields(
+                    for c in fields(AutoEvalColumnQA)
                     if c.displayed_by_default and not c.hidden and not c.never_hidden
                 ],
                 label="Select columns to show",
@@ -189,10 +113,10 @@ with demo:
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
-                    [c.name for c in fields(
+                    [c.name for c in fields(AutoEvalColumnQA) if c.never_hidden]
                     + shown_columns.value
                 ],
-                headers=[c.name for c in fields(
+                headers=[c.name for c in fields(AutoEvalColumnQA) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -201,7 +125,7 @@ with demo:
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=
+                value=original_df_qa[COLS],
                 headers=COLS,
                 datatype=TYPES,
                 visible=False,
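Note: the hunks above import update_table from the new utils module but do not show the event wiring. Below is a minimal sketch of how the imported update_table could be hooked up inside the with demo: block, assuming Gradio's standard event API; search_bar (a gr.Textbox) and filter_reranking (a gr.CheckboxGroup for the reranking models) are hypothetical component names not shown in this diff.

    # Sketch only (not part of this commit): wire the imported update_table to the UI.
    # Input order matches update_table(hidden_df, columns, reranking_query, query).
    selector_inputs = [hidden_leaderboard_table_for_search, shown_columns, filter_reranking, search_bar]
    search_bar.submit(update_table, selector_inputs, leaderboard_table)
    shown_columns.change(update_table, selector_inputs, leaderboard_table)
    filter_reranking.change(update_table, selector_inputs, leaderboard_table)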
src/benchmarks.py
CHANGED
@@ -106,9 +106,12 @@ metric_list = [
 
 @dataclass
 class Benchmark:
-    name: str  # [
+    name: str  # [domain]_[language]_[metric], task_key in the json file,
     metric: str  # ndcg_at_1 ,metric_key in the json file
     col_name: str  # [domain]_[language], name to display in the leaderboard
+    domain: str
+    lang: str
+    task: str
 
 qa_benchmark_dict = {}
 long_doc_benchmark_dict = {}
@@ -116,18 +119,20 @@ for task, domain_dict in dataset_dict.items():
     for domain, lang_dict in domain_dict.items():
         for lang, dataset_list in lang_dict.items():
             if task == "qa":
-                benchmark_name = f"{
+                benchmark_name = f"{domain}_{lang}"
                 benchmark_name = get_safe_name(benchmark_name)
                 col_name = f"{domain}_{lang}"
                 for metric in dataset_list:
-                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
             elif task == "long_doc":
                 for dataset in dataset_list:
                     col_name = f"{domain}_{lang}_{dataset}"
                     for metric in metric_list:
-                        benchmark_name = f"{
+                        benchmark_name = f"{domain}_{lang}_{dataset}_{metric}"
                         benchmark_name = get_safe_name(benchmark_name)
-                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
 
 BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
 BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
+
+BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
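For reference, a small inspection sketch of what the extended Benchmark entries expose once the enums are built; it assumes the Space's repo root is on the import path and that dataset_dict defines at least one QA entry.

# Sketch only: inspect the generated QA benchmark metadata.
from src.benchmarks import BenchmarksQA, BENCHMARK_COLS_QA

for member in BenchmarksQA:
    b = member.value  # a Benchmark dataclass instance
    print(member.name, b.col_name, b.metric, b.domain, b.lang, b.task)

# Column names shown on the leaderboard, e.g. "wiki_en" (depends on dataset_dict).
print(BENCHMARK_COLS_QA)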
src/populate.py
CHANGED
@@ -9,16 +9,17 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 from typing import Tuple
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task: str, metric: str) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = []
     for v in raw_data:
-        all_data_json += v.to_dict()
+        all_data_json += v.to_dict(task=task, metric=metric)
 
     df = pd.DataFrame.from_records(all_data_json)
-    df[
-
+    df[AutoEvalColumnQA.average.name] = df[benchmark_cols].mean(axis=1)
+    df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
+    df.reset_index(inplace=True)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
tests/src/test_populate.py
CHANGED
@@ -11,10 +11,13 @@ def test_get_leaderboard_df():
     benchmark_cols = ['wiki_en', 'wiki_zh',]
     raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
     assert df.shape[0] == 2
-
-
-
-
+    # the results contains only one embedding model
+    for i in range(2):
+        assert df["Retrieval Model"][i] == "bge-m3"
+    # the results contains only two reranking model
+    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
+    assert df["Reranking Model"][1] == "NoReranker"
+    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
     assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
 
 
tests/test_utils.py
ADDED
@@ -0,0 +1,53 @@
+import pandas as pd
+import pytest
+
+from utils import filter_models, search_table, filter_queries, select_columns
+
+
+@pytest.fixture
+def toy_df():
+    return pd.DataFrame(
+        {
+            "Retrieval Model": [
+                "bge-m3",
+                "bge-m3",
+                "jina-embeddings-v2-base",
+                "jina-embeddings-v2-base"
+            ],
+            "Reranking Model": [
+                "bge-reranker-v2-m3",
+                "NoReranker",
+                "bge-reranker-v2-m3",
+                "NoReranker"
+            ],
+            "Average ⬆️": [0.6, 0.4, 0.3, 0.2],
+            "wiki_en": [0.8, 0.7, 0.2, 0.1],
+            "wiki_zh": [0.4, 0.1, 0.4, 0.3],
+            "news_en": [0.8, 0.7, 0.2, 0.1],
+            "news_zh": [0.4, 0.1, 0.4, 0.3],
+        }
+    )
+
+
+def test_filter_models(toy_df):
+    df_result = filter_models(toy_df, ["bge-reranker-v2-m3", ])
+    assert len(df_result) == 2
+    assert df_result.iloc[0]["Reranking Model"] == "bge-reranker-v2-m3"
+
+
+def test_search_table(toy_df):
+    df_result = search_table(toy_df, "jina")
+    assert len(df_result) == 2
+    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
+
+
+def test_filter_queries(toy_df):
+    df_result = filter_queries("jina", toy_df)
+    assert len(df_result) == 2
+    assert df_result.iloc[0]["Retrieval Model"] == "jina-embeddings-v2-base"
+
+
+def test_select_columns(toy_df):
+    df_result = select_columns(toy_df, ['news',], ['zh',])
+    assert len(df_result.columns) == 4
+    assert df_result['Average ⬆️'].equals(df_result['news_zh'])
tests/toydata/test_results/bge-m3/NoReranker/results_demo_2023-12-21T18-10-08.json
CHANGED
@@ -27,7 +27,7 @@
                 "domain": "wiki",
                 "lang": "en",
                 "dataset": "unknown",
-                "value": 0.
+                "value": 0.39083
             }
         ]
     },
utils.py
ADDED
@@ -0,0 +1,70 @@
+import pandas as pd
+
+from src.display.utils import AutoEvalColumnQA, COLS
+from src.benchmarks import BENCHMARK_COLS_QA, BenchmarksQA
+
+
+def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
+    return df.loc[df["Reranking Model"].isin(reranking_query)]
+
+
+def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
+    final_df = []
+    if query != "":
+        queries = [q.strip() for q in query.split(";")]
+        for _q in queries:
+            _q = _q.strip()
+            if _q != "":
+                temp_filtered_df = search_table(filtered_df, _q)
+                if len(temp_filtered_df) > 0:
+                    final_df.append(temp_filtered_df)
+        if len(final_df) > 0:
+            filtered_df = pd.concat(final_df)
+            filtered_df = filtered_df.drop_duplicates(
+                subset=[
+                    AutoEvalColumnQA.retrieval_model.name,
+                    AutoEvalColumnQA.reranking_model.name,
+                ]
+            )
+
+    return filtered_df
+
+
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
+
+
+def select_columns(df: pd.DataFrame, domain_query: list, language_query: list) -> pd.DataFrame:
+    always_here_cols = [
+        AutoEvalColumnQA.retrieval_model.name,
+        AutoEvalColumnQA.reranking_model.name,
+        AutoEvalColumnQA.average.name
+    ]
+    selected_cols = []
+    for c in COLS:
+        if c not in df.columns:
+            continue
+        if c not in BENCHMARK_COLS_QA:
+            continue
+        eval_col = BenchmarksQA[c].value
+        if eval_col.domain not in domain_query:
+            continue
+        if eval_col.lang not in language_query:
+            continue
+        selected_cols.append(c)
+    # We use COLS to maintain sorting
+    filtered_df = df[always_here_cols + selected_cols]
+    filtered_df[AutoEvalColumnQA.average.name] = filtered_df[selected_cols].mean(axis=1)
+    return filtered_df
+
+
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    reranking_query: list,
+    query: str,
+):
+    filtered_df = filter_models(hidden_df, reranking_query)
+    filtered_df = filter_queries(query, filtered_df)
+    df = select_columns(filtered_df, columns)
+    return df
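A short usage sketch of how these helpers compose, mirroring the toy frame in tests/test_utils.py; it assumes it is run from the Space's repo root, that wiki_en/wiki_zh are among the configured benchmark columns, and that the AutoEvalColumnQA display names resolve to "Retrieval Model", "Reranking Model" and "Average ⬆️" as used in the tests.

# Sketch only: compose the filters on a toy frame like the one in tests/test_utils.py.
import pandas as pd
from utils import filter_models, filter_queries, select_columns

toy = pd.DataFrame({
    "Retrieval Model": ["bge-m3", "bge-m3"],
    "Reranking Model": ["bge-reranker-v2-m3", "NoReranker"],
    "Average ⬆️": [0.6, 0.4],
    "wiki_en": [0.8, 0.7],
    "wiki_zh": [0.4, 0.1],
})

df = filter_models(toy, ["NoReranker"])      # keep rows by reranking model
df = filter_queries("bge", df)               # ';'-separated substring search on the retrieval model
df = select_columns(df, ["wiki"], ["en"])    # keep wiki_* English columns; Average ⬆️ is recomputed
print(df)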