Spaces:
Runtime error
Runtime error
tasks
Browse files
- src/about.py +58 -1
- src/display/utils.py +6 -6
- src/leaderboard/read_evals.py +16 -16
src/about.py
CHANGED
@@ -43,7 +43,64 @@ class Tasks(Enum):
|
|
43 |
# task27 = Task("polish_poquad_reranking", "acc,none", "poquad_reranking", "other", 0.0)
|
44 |
# task28 = Task("polish_abstractive_poquad_rag", "levenshtein,none", "abstractive_poquad_rag", "other", 0.0)
|
45 |
# task29 = Task("polish_abstractive_poquad_open_book", "levenshtein,none", "abstractive_poquad_open_book", "other", 0.0)
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
|
49 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
|
|
43 |
# task27 = Task("polish_poquad_reranking", "acc,none", "poquad_reranking", "other", 0.0)
|
44 |
# task28 = Task("polish_abstractive_poquad_rag", "levenshtein,none", "abstractive_poquad_rag", "other", 0.0)
|
45 |
# task29 = Task("polish_abstractive_poquad_open_book", "levenshtein,none", "abstractive_poquad_open_book", "other", 0.0)
|
46 |
+
task30a = Task("polish_pes_regex", "exact_match,score-first", "pes", "generate_until", 0.2)
|
47 |
+
task30 = Task("polish_pes_alergologia", "exact_match,score-first", "alergologia", "generate_until", 0.2)
|
48 |
+
task31 = Task("polish_pes_anestezjologia", "exact_match,score-first", "anestezjologia", "generate_until", 0.2)
|
49 |
+
task32 = Task("polish_pes_angiologia", "exact_match,score-first", "angiologia", "generate_until", 0.2)
|
50 |
+
task33 = Task("polish_pes_balneologia_i_medycyna_fizykalna", "exact_match,score-first", "balneologia_i_medycyna_fizykalna", "generate_until", 0.2)
|
51 |
+
task34 = Task("polish_pes_chirurgia_dziecieca", "exact_match,score-first", "chirurgia_dziecieca", "generate_until", 0.2)
|
52 |
+
task35 = Task("polish_pes_chirurgia_naczyniowa", "exact_match,score-first", "chirurgia_naczyniowa", "generate_until", 0.2)
|
53 |
+
task36 = Task("polish_pes_chirurgia_ogolna", "exact_match,score-first", "chirurgia_ogolna", "generate_until", 0.2)
|
54 |
+
task37 = Task("polish_pes_chirurgia_onkologiczna", "exact_match,score-first", "chirurgia_onkologiczna", "generate_until", 0.2)
|
55 |
+
task38 = Task("polish_pes_chirurgia_stomatologiczna", "exact_match,score-first", "chirurgia_stomatologiczna", "generate_until", 0.2)
|
56 |
+
task39 = Task("polish_pes_chirurgia_szczekowo-twarzowa", "exact_match,score-first", "chirurgia_szczekowo-twarzowa", "generate_until", 0.2)
|
57 |
+
task40 = Task("polish_pes_choroby_pluc", "exact_match,score-first", "choroby_pluc", "generate_until", 0.2)
|
58 |
+
task41 = Task("polish_pes_choroby_pluc_dzieci", "exact_match,score-first", "choroby_pluc_dzieci", "generate_until", 0.2)
|
59 |
+
task42 = Task("polish_pes_choroby_wewnetrzne", "exact_match,score-first", "choroby_wewnetrzne", "generate_until", 0.2)
|
60 |
+
task43 = Task("polish_pes_choroby_zakazne", "exact_match,score-first", "choroby_zakazne", "generate_until", 0.2)
|
61 |
+
task44 = Task("polish_pes_dermatologia_i_wenerologia", "exact_match,score-first", "dermatologia_i_wenerologia", "generate_until", 0.2)
|
62 |
+
task45 = Task("polish_pes_diabetologia", "exact_match,score-first", "diabetologia", "generate_until", 0.2)
|
63 |
+
task46 = Task("polish_pes_endokrynologia", "exact_match,score-first", "endokrynologia", "generate_until", 0.2)
|
64 |
+
task47 = Task("polish_pes_endokrynologia_ginekologiczna_i_rozrodczosc", "exact_match,score-first", "endokrynologia_ginekologiczna_i_rozrodczosc", "generate_until", 0.2)
|
65 |
+
task48 = Task("polish_pes_endokrynologia_i_diabetologia_dziecieca", "exact_match,score-first", "endokrynologia_i_diabetologia_dziecieca", "generate_until", 0.2)
|
66 |
+
task49 = Task("polish_pes_gastroenterologia", "exact_match,score-first", "gastroenterologia", "generate_until", 0.2)
|
67 |
+
task50 = Task("polish_pes_gastroenterologia_dziecieca", "exact_match,score-first", "gastroenterologia_dziecieca", "generate_until", 0.2)
|
68 |
+
task51 = Task("polish_pes_geriatria", "exact_match,score-first", "geriatria", "generate_until", 0.2)
|
69 |
+
task52 = Task("polish_pes_ginekologia_onkologiczna", "exact_match,score-first", "ginekologia_onkologiczna", "generate_until", 0.2)
|
70 |
+
task53 = Task("polish_pes_hematologia", "exact_match,score-first", "hematologia", "generate_until", 0.2)
|
71 |
+
task54 = Task("polish_pes_hipertensjologia", "exact_match,score-first", "hipertensjologia", "generate_until", 0.2)
|
72 |
+
task55 = Task("polish_pes_kardiochirurgia", "exact_match,score-first", "kardiochirurgia", "generate_until", 0.2)
|
73 |
+
task56 = Task("polish_pes_kardiologia", "exact_match,score-first", "kardiologia", "generate_until", 0.2)
|
74 |
+
task57 = Task("polish_pes_medycyna_pracy", "exact_match,score-first", "medycyna_pracy", "generate_until", 0.2)
|
75 |
+
task58 = Task("polish_pes_medycyna_paliatywna", "exact_match,score-first", "medycyna_paliatywna", "generate_until", 0.2)
|
76 |
+
task59 = Task("polish_pes_medycyna_ratunkowa", "exact_match,score-first", "medycyna_ratunkowa", "generate_until", 0.2)
|
77 |
+
task60 = Task("polish_pes_medycyna_rodzinna", "exact_match,score-first", "medycyna_rodzinna", "generate_until", 0.2)
|
78 |
+
task61 = Task("polish_pes_medycyna_sportowa", "exact_match,score-first", "medycyna_sportowa", "generate_until", 0.2)
|
79 |
+
task62 = Task("polish_pes_nefrologia", "exact_match,score-first", "nefrologia", "generate_until", 0.2)
|
80 |
+
task63 = Task("polish_pes_neonatologia", "exact_match,score-first", "neonatologia", "generate_until", 0.2)
|
81 |
+
task64 = Task("polish_pes_neurochirurgia", "exact_match,score-first", "neurochirurgia", "generate_until", 0.2)
|
82 |
+
task65 = Task("polish_pes_neurologia", "exact_match,score-first", "neurologia", "generate_until", 0.2)
|
83 |
+
task66 = Task("polish_pes_neurologia_dziecieca", "exact_match,score-first", "neurologia_dziecieca", "generate_until", 0.2)
|
84 |
+
task67 = Task("polish_pes_okulistyka", "exact_match,score-first", "okulistyka", "generate_until", 0.2)
|
85 |
+
task68 = Task("polish_pes_onkologia_kliniczna", "exact_match,score-first", "onkologia_kliniczna", "generate_until", 0.2)
|
86 |
+
task69 = Task("polish_pes_ortodoncja", "exact_match,score-first", "ortodoncja", "generate_until", 0.2)
|
87 |
+
task70 = Task("polish_pes_ortopedia", "exact_match,score-first", "ortopedia", "generate_until", 0.2)
|
88 |
+
task71 = Task("polish_pes_otolaryngologia", "exact_match,score-first", "otolaryngologia", "generate_until", 0.2)
|
89 |
+
task72 = Task("polish_pes_patomorfologia", "exact_match,score-first", "patomorfologia", "generate_until", 0.2)
|
90 |
+
task73 = Task("polish_pes_pediatria", "exact_match,score-first", "pediatria", "generate_until", 0.2)
|
91 |
+
task74 = Task("polish_pes_perinatologia", "exact_match,score-first", "perinatologia", "generate_until", 0.2)
|
92 |
+
task75 = Task("polish_pes_periodontologia", "exact_match,score-first", "periodontologia", "generate_until", 0.2)
|
93 |
+
task76 = Task("polish_pes_poloznictwo_i_ginekologia", "exact_match,score-first", "poloznictwo_i_ginekologia", "generate_until", 0.2)
|
94 |
+
task77 = Task("polish_pes_protetyka_stomatologiczna", "exact_match,score-first", "protetyka_stomatologiczna", "generate_until", 0.2)
|
95 |
+
task78 = Task("polish_pes_psychiatria", "exact_match,score-first", "psychiatria", "generate_until", 0.2)
|
96 |
+
task79 = Task("polish_pes_psychiatria_dzieci_i_mlodziezy", "exact_match,score-first", "psychiatria_dzieci_i_mlodziezy", "generate_until", 0.2)
|
97 |
+
task80 = Task("polish_pes_radiologia_i_diagnostyka_obrazowa", "exact_match,score-first", "radiologia_i_diagnostyka_obrazowa", "generate_until", 0.2)
|
98 |
+
task81 = Task("polish_pes_radioterapia_onkologiczna", "exact_match,score-first", "radioterapia_onkologiczna", "generate_until", 0.2)
|
99 |
+
task82 = Task("polish_pes_rehabilitacja_medyczna", "exact_match,score-first", "rehabilitacja_medyczna", "generate_until", 0.2)
|
100 |
+
task83 = Task("polish_pes_reumatologia", "exact_match,score-first", "reumatologia", "generate_until", 0.2)
|
101 |
+
task84 = Task("polish_pes_stomatologia_dziecieca", "exact_match,score-first", "stomatologia_dziecieca", "generate_until", 0.2)
|
102 |
+
task85 = Task("polish_pes_stomatologia_zachowawcza", "exact_match,score-first", "stomatologia_zachowawcza", "generate_until", 0.2)
|
103 |
+
task86 = Task("polish_pes_transplantologia_kliniczna", "exact_match,score-first", "transplantologia_kliniczna", "generate_until", 0.2)
|
104 |
|
105 |
|
106 |
g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
|
src/display/utils.py
CHANGED
@@ -32,14 +32,14 @@ auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "
|
|
32 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
|
33 |
#Scores
|
34 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
35 |
-
auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
|
36 |
-
auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
37 |
-
auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
38 |
-
auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
|
39 |
|
40 |
for task in Tasks:
|
41 |
-
show = task.value.col_name not in ['poquad_reranking','abstractive_poquad_rag','abstractive_poquad_open_book', 'pes_g']
|
42 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", show)])
|
43 |
# Model information
|
44 |
|
45 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
|
|
32 |
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
|
33 |
#Scores
|
34 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
35 |
+
# auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
|
36 |
+
# auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
|
37 |
+
# auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
|
38 |
+
# auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
|
39 |
|
40 |
for task in Tasks:
|
41 |
+
# show = task.value.col_name not in ['poquad_reranking','abstractive_poquad_rag','abstractive_poquad_open_book', 'pes_g']
|
42 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
43 |
# Model information
|
44 |
|
45 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
src/leaderboard/read_evals.py
CHANGED
@@ -283,30 +283,30 @@ class EvalResult:
|
|
283 |
except AttributeError:
|
284 |
print(f"AttributeError revision")
|
285 |
|
286 |
-
try:
|
287 |
-
data_dict[AutoEvalColumn.average_old.name] = average_old
|
288 |
-
except KeyError:
|
289 |
-
print(f"Could not find average_old")
|
290 |
|
291 |
try:
|
292 |
data_dict[AutoEvalColumn.average.name] = average
|
293 |
except KeyError:
|
294 |
print(f"Could not find average")
|
295 |
|
296 |
-
try:
|
297 |
-
data_dict[AutoEvalColumn.average_g.name] = average_g
|
298 |
-
except KeyError:
|
299 |
-
print(f"Could not find average_g")
|
300 |
|
301 |
-
try:
|
302 |
-
data_dict[AutoEvalColumn.average_mc.name] = average_mc
|
303 |
-
except KeyError:
|
304 |
-
print(f"Could not find average_mc")
|
305 |
|
306 |
-
try:
|
307 |
-
data_dict[AutoEvalColumn.average_rag.name] = average_rag
|
308 |
-
except KeyError:
|
309 |
-
print(f"Could not find average_rag")
|
310 |
|
311 |
try:
|
312 |
data_dict[AutoEvalColumn.license.name] = self.license
|
|
|
283 |
except AttributeError:
|
284 |
print(f"AttributeError revision")
|
285 |
|
286 |
+
# try:
|
287 |
+
# data_dict[AutoEvalColumn.average_old.name] = average_old
|
288 |
+
# except KeyError:
|
289 |
+
# print(f"Could not find average_old")
|
290 |
|
291 |
try:
|
292 |
data_dict[AutoEvalColumn.average.name] = average
|
293 |
except KeyError:
|
294 |
print(f"Could not find average")
|
295 |
|
296 |
+
# try:
|
297 |
+
# data_dict[AutoEvalColumn.average_g.name] = average_g
|
298 |
+
# except KeyError:
|
299 |
+
# print(f"Could not find average_g")
|
300 |
|
301 |
+
# try:
|
302 |
+
# data_dict[AutoEvalColumn.average_mc.name] = average_mc
|
303 |
+
# except KeyError:
|
304 |
+
# print(f"Could not find average_mc")
|
305 |
|
306 |
+
# try:
|
307 |
+
# data_dict[AutoEvalColumn.average_rag.name] = average_rag
|
308 |
+
# except KeyError:
|
309 |
+
# print(f"Could not find average_rag")
|
310 |
|
311 |
try:
|
312 |
data_dict[AutoEvalColumn.license.name] = self.license
|