“WadoodAbdul”
commited on
Commit
•
f738aa2
1
Parent(s):
3aa629d
added evaluation metric type radio button
Browse files- app.py +85 -4
- src/display/utils.py +3 -0
- src/leaderboard/read_evals.py +5 -5
- src/populate.py +2 -2
app.py
CHANGED
@@ -60,12 +60,20 @@ try:
 except Exception:
     restart_space()
 
+# Span based results
+_, span_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "datasets")
+span_based_datasets_leaderboard_df = span_based_datasets_original_df.copy()
 
-raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
-datasets_leaderboard_df = datasets_original_df.copy()
+_, span_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "clinical_types")
+span_based_types_leaderboard_df = span_based_types_original_df.copy()
+
+# Token based results
+_, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
+token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
+
+_, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
+token_based_types_leaderboard_df = token_based_types_original_df.copy()
 
-raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "clinical_types")
-types_leaderboard_df = types_original_df.copy()
 
 (
     finished_eval_queue_df,
@@ -74,6 +82,36 @@ types_leaderboard_df = types_original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
+def update_df(evaluation_metric, shown_columns, subset="datasets"):
+    print(evaluation_metric)
+
+    if subset == "datasets":
+        match evaluation_metric:
+            case "Span Based":
+                leaderboard_table_df = span_based_datasets_leaderboard_df.copy()
+                hidden_leader_board_df = span_based_datasets_original_df
+            case "Token Based":
+                leaderboard_table_df = token_based_datasets_leaderboard_df.copy()
+                hidden_leader_board_df = token_based_datasets_original_df
+            case _:
+                pass
+    else:
+        match evaluation_metric:
+            case "Span Based":
+                leaderboard_table_df = span_based_types_leaderboard_df.copy()
+                hidden_leader_board_df = span_based_types_original_df
+            case "Token Based":
+                leaderboard_table_df = token_based_types_leaderboard_df.copy()
+                hidden_leader_board_df = token_based_types_original_df
+            case _:
+                pass
+
+
+    value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
+
+    return leaderboard_table_df[value_cols], hidden_leader_board_df
+
+
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -249,6 +287,12 @@ with demo:
                 # )
             with gr.Column(min_width=320):
                 # with gr.Box(elem_id="box-filter"):
+
+                eval_metric = gr.Radio(
+                    choices=["Span Based", "Token Based"],
+                    value = "Span Based",
+                    label="Evaluation Metric",
+                )
                 filter_columns_type = gr.CheckboxGroup(
                     label="Model Types",
                     choices=[t.to_str() for t in ModelType],
@@ -270,6 +314,9 @@ with demo:
                 # interactive=True,
                 # elem_id="filter-columns-size",
                 # )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(eval_metric.value, shown_columns.value, subset="datasets")
+
             leaderboard_table = gr.components.Dataframe(
                 value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
@@ -286,6 +333,19 @@ with demo:
                 datatype=TYPES,
                 visible=False,
             )
+
+            eval_metric.change(
+                lambda a, b: update_df(a,b, "datasets") ,
+                inputs=[
+                    eval_metric,
+                    shown_columns,
+                ],
+                outputs=[
+                    leaderboard_table,
+                    hidden_leaderboard_table_for_search,
+                ]
+            )
+
             search_bar.submit(
                 update_table,
                 [
@@ -317,6 +377,7 @@ with demo:
                 queue=True,
             )
 
+
         with gr.TabItem("🏅 Clinical Types", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
@@ -343,6 +404,12 @@ with demo:
                     # value=False, label="Show gated/private/deleted models", interactive=True
                     # )
                 with gr.Column(min_width=320):
+
+                    eval_metric = gr.Radio(
+                        choices=["Span Based", "Token Based"],
+                        value = "Span Based",
+                        label="Evaluation Metric",
+                    )
                     # with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model Types",
@@ -372,6 +439,7 @@ with demo:
                     # interactive=True,
                     # elem_id="filter-columns-size",
                     # )
+            types_leaderboard_df, types_original_df = update_df(eval_metric.value, shown_columns.value, subset="clinical_types")
 
             leaderboard_table = gr.components.Dataframe(
                 value=types_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
@@ -389,6 +457,19 @@ with demo:
                 datatype=TYPES,
                 visible=False,
             )
+
+            eval_metric.change(
+                fn=lambda a, b: update_df(a,b, "clinical_types"),
+                inputs=[
+                    eval_metric,
+                    shown_columns,
+                ],
+                outputs=[
+                    leaderboard_table,
+                    hidden_leaderboard_table_for_search
+                ]
+            )
+
             search_bar.submit(
                 update_table,
                 [
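
In short, app.py now builds span based and token based leaderboard dataframes up front and swaps between them whenever the new Evaluation Metric radio changes. Below is a minimal, self-contained sketch of that pattern; the toy dataframes, model names and scores are placeholders, not values from the leaderboard.

    import gradio as gr
    import pandas as pd

    # Placeholder tables standing in for the span based / token based leaderboards
    # that the real app precomputes with get_leaderboard_df.
    span_based_df = pd.DataFrame({"model": ["model-a", "model-b"], "score": [0.91, 0.88]})
    token_based_df = pd.DataFrame({"model": ["model-a", "model-b"], "score": [0.85, 0.83]})

    def pick_df(evaluation_metric):
        # Same idea as the commit's update_df: return the table matching the radio value.
        match evaluation_metric:
            case "Token Based":
                return token_based_df.copy()
            case _:
                return span_based_df.copy()

    with gr.Blocks() as demo:
        eval_metric = gr.Radio(choices=["Span Based", "Token Based"], value="Span Based", label="Evaluation Metric")
        table = gr.Dataframe(value=span_based_df)
        # Re-render the table whenever the radio selection changes.
        eval_metric.change(pick_df, inputs=[eval_metric], outputs=[table])

    demo.launch()
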
src/display/utils.py
CHANGED
@@ -162,6 +162,9 @@ class PromptTemplateName(Enum):
     LLamaNERTemplate = "llama_70B_ner"
     # MixtralNERTemplate = "mixtral_ner_v0.3"
 
+class EvaluationMetrics(Enum):
+    SpanBased = "Span Based"
+    TokenBased = "Token Based"
 
 
 # Column selection
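
The new EvaluationMetrics enum simply pairs identifier-style names with the display strings used by the radio button. A small hedged example of how such an enum can back the UI choices (the app.py changes above still pass the string literals directly):

    from enum import Enum

    class EvaluationMetrics(Enum):
        SpanBased = "Span Based"
        TokenBased = "Token Based"

    # One possible use: derive the Radio choices from the enum instead of repeating literals.
    choices = [metric.value for metric in EvaluationMetrics]
    print(choices)                               # ['Span Based', 'Token Based']
    print(EvaluationMetrics("Span Based").name)  # 'SpanBased'
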
src/leaderboard/read_evals.py
CHANGED
@@ -36,7 +36,7 @@ class EvalResult:
     display_result:bool = True
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, evaluation_metric):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -82,7 +82,7 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["dataset_results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data[evaluation_metric]["dataset_results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -94,7 +94,7 @@ class EvalResult:
             clinical_type = clinical_type.value
 
            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(clinical_type.metric, None) for k, v in data["clinical_type_results"].items() if clinical_type.benchmark == k])
+            accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -212,7 +212,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -233,7 +233,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, evaluation_metric)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
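
Indexing data[evaluation_metric] before reading dataset_results / clinical_type_results implies the result JSON now nests the old structure under one key per metric. The sketch below shows that assumed layout and the lookup the updated accs lines perform; the dataset key, clinical type key, metric name and scores are all illustrative placeholders.

    # Assumed shape of a model result file after this change (illustrative values only).
    data = {
        "SpanBasedWithPartialOverlap": {
            "dataset_results": {"dataset_x": {"f1": 0.78}},
            "clinical_type_results": {"condition": {"f1": 0.74}},
        },
        "TokenBasedWithMacroAverage": {
            "dataset_results": {"dataset_x": {"f1": 0.81}},
            "clinical_type_results": {"condition": {"f1": 0.77}},
        },
    }

    evaluation_metric = "SpanBasedWithPartialOverlap"
    # Same access pattern as the updated lines in init_from_json_file.
    for dataset_name, scores in data[evaluation_metric]["dataset_results"].items():
        print(dataset_name, scores.get("f1", None))  # dataset_x 0.78
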
src/populate.py
CHANGED
@@ -8,9 +8,9 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, subset:str) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
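
get_leaderboard_df now takes the evaluation metric between the column lists and the subset and forwards it to get_raw_eval_results. A hedged sketch of the new call shape, mirroring the calls added in app.py; it assumes it is run from the Space's repo root, and the path and column constants are stubbed so the snippet stands on its own.

    from src.populate import get_leaderboard_df  # module path as in this repo

    # Stub values standing in for the app's real configuration.
    EVAL_RESULTS_PATH = "eval-results"
    EVAL_REQUESTS_PATH = "eval-queue"
    DATASET_COLS: list = []
    DATASET_BENCHMARK_COLS: list = []

    raw_data, span_based_datasets_df = get_leaderboard_df(
        EVAL_RESULTS_PATH,
        EVAL_REQUESTS_PATH,
        DATASET_COLS,
        DATASET_BENCHMARK_COLS,
        "SpanBasedWithPartialOverlap",  # evaluation_metric, forwarded to get_raw_eval_results
        "datasets",                     # subset
    )
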