Spaces:
Running
Running
ADD: BoolQ, TurthfulQA
#5
by
Cookize
- opened
tasks.py
CHANGED
@@ -209,6 +209,7 @@ def multichoice_zh(responses: Any, references: list[str]):
|
|
209 |
class Metrics:
|
210 |
cmmlu = multichoice_zh
|
211 |
mmlu = multichoice
|
|
|
212 |
ceval = multichoice_zh
|
213 |
|
214 |
def winogrande(responses: list[str], answers: list[str | int]):
|
@@ -269,6 +270,13 @@ class Metrics:
|
|
269 |
|
270 |
return responses, answers
|
271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
def MATH(responses: list[str], answers: list[str]):
|
273 |
extract_responses = sync_pipe(get_answer)(responses)
|
274 |
extract_answers = sync_pipe(get_answer)(answers)
|
@@ -293,7 +301,7 @@ class CMMLU:
|
|
293 |
for choice in list("ABCD"):
|
294 |
prompt += f"\n{choice}. {example[choice]}"
|
295 |
|
296 |
-
|
297 |
return {"prompt": prompt}
|
298 |
|
299 |
subcategories = {
|
@@ -808,6 +816,81 @@ class BBH:
|
|
808 |
return suite
|
809 |
|
810 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
811 |
class CEVAL:
|
812 |
input_column = "input"
|
813 |
label_column = "answer"
|
|
|
209 |
class Metrics:
|
210 |
cmmlu = multichoice_zh
|
211 |
mmlu = multichoice
|
212 |
+
truthful_qa_mc1 = multichoice
|
213 |
ceval = multichoice_zh
|
214 |
|
215 |
def winogrande(responses: list[str], answers: list[str | int]):
|
|
|
270 |
|
271 |
return responses, answers
|
272 |
|
273 |
+
def boolq(responses: list[str], answers: list[str | int]):
|
274 |
+
|
275 |
+
responses = [first_capital_postprocess(response) for response in responses]
|
276 |
+
answers = ["A" if answer else "B" for answer in answers]
|
277 |
+
|
278 |
+
return responses, answers
|
279 |
+
|
280 |
def MATH(responses: list[str], answers: list[str]):
|
281 |
extract_responses = sync_pipe(get_answer)(responses)
|
282 |
extract_answers = sync_pipe(get_answer)(answers)
|
|
|
301 |
for choice in list("ABCD"):
|
302 |
prompt += f"\n{choice}. {example[choice]}"
|
303 |
|
304 |
+
prompt += "\n答案:"
|
305 |
return {"prompt": prompt}
|
306 |
|
307 |
subcategories = {
|
|
|
816 |
return suite
|
817 |
|
818 |
|
819 |
+
class BoolQ:
|
820 |
+
input_column = "input"
|
821 |
+
label_column = "answer"
|
822 |
+
|
823 |
+
@classmethod
|
824 |
+
def prompt_boolq(cls, example, chat=False):
|
825 |
+
|
826 |
+
prompt = f"{example['passage']}\nQuestion: {example['question']}\nA. Yes\nB. No\nAnswer: "
|
827 |
+
|
828 |
+
return {"input": prompt}
|
829 |
+
|
830 |
+
@classmethod
|
831 |
+
def suite(cls, chat: bool):
|
832 |
+
|
833 |
+
suite = [
|
834 |
+
Task(
|
835 |
+
dataset_name="boolq",
|
836 |
+
metric_name=("sustech/tlem", "boolq"),
|
837 |
+
input_column=cls.input_column,
|
838 |
+
label_column=cls.label_column,
|
839 |
+
prompt=partial(cls.prompt_boolq, chat=chat),
|
840 |
+
few_shot=0 if chat else 5,
|
841 |
+
few_shot_from="train",
|
842 |
+
split="validation",
|
843 |
+
)
|
844 |
+
]
|
845 |
+
|
846 |
+
return suite
|
847 |
+
|
848 |
+
class TruthfulQAMC1:
|
849 |
+
input_column = "input"
|
850 |
+
label_column = "answer"
|
851 |
+
|
852 |
+
@classmethod
|
853 |
+
def prompt_truthful_qa(cls, example):
|
854 |
+
|
855 |
+
target = example["mc1_targets"]
|
856 |
+
choices = target["choices"]
|
857 |
+
labels = target["labels"]
|
858 |
+
|
859 |
+
prompt = f"The following is a multiple-choice question. Please choose the most suitable one as the answer to this question.\n\n"
|
860 |
+
prompt += example["question"]
|
861 |
+
|
862 |
+
answer = []
|
863 |
+
|
864 |
+
for idx, choice, label in zip(list("ABCDEFGHIJ")[:len(choices)], choices, labels):
|
865 |
+
|
866 |
+
prompt += f"\n{idx}. {choice}"
|
867 |
+
|
868 |
+
if label == 1:
|
869 |
+
answer = idx
|
870 |
+
|
871 |
+
prompt += "\nAnswer: "
|
872 |
+
|
873 |
+
return {
|
874 |
+
"input": prompt,
|
875 |
+
"answer": answer
|
876 |
+
}
|
877 |
+
|
878 |
+
@classmethod
|
879 |
+
def suite(cls):
|
880 |
+
suite = [
|
881 |
+
Task(
|
882 |
+
dataset_name=("truthful_qa", "multiple_choice"),
|
883 |
+
metric_name=("sustech/tlem", "truthful_qa_mc1"),
|
884 |
+
input_column=cls.input_column,
|
885 |
+
label_column=cls.label_column,
|
886 |
+
prompt=partial(cls.prompt_truthful_qa),
|
887 |
+
few_shot=0,
|
888 |
+
split="validation",
|
889 |
+
)
|
890 |
+
]
|
891 |
+
|
892 |
+
return suite
|
893 |
+
|
894 |
class CEVAL:
|
895 |
input_column = "input"
|
896 |
label_column = "answer"
|
tlem.py
CHANGED
@@ -151,7 +151,10 @@ class Suite(EvaluationSuite):
|
|
151 |
suite = DROP.suite()
|
152 |
case "winogrande":
|
153 |
suite = Winogrande.suite()
|
154 |
-
|
|
|
|
|
|
|
155 |
case "mt_bench":
|
156 |
suite = Task(
|
157 |
dataset_name="SUSTech/mt_bench_judge",
|
|
|
151 |
suite = DROP.suite()
|
152 |
case "winogrande":
|
153 |
suite = Winogrande.suite()
|
154 |
+
case "truthfulqa_mc1":
|
155 |
+
suite = TruthfulQAMC1.suite()
|
156 |
+
case _ if name.startswith("boolq"):
|
157 |
+
suite = BoolQ.suite(chat=chat)
|
158 |
case "mt_bench":
|
159 |
suite = Task(
|
160 |
dataset_name="SUSTech/mt_bench_judge",
|
utils.py
CHANGED
@@ -74,27 +74,27 @@ def extract_choice_zh(gen):
|
|
74 |
def extract_choice(gen):
|
75 |
# answer is A | choice is A | choose A
|
76 |
res = re.search(
|
77 |
-
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^
|
78 |
gen,
|
79 |
)
|
80 |
|
81 |
# A is correct | A is right
|
82 |
if res is None:
|
83 |
res = re.search(
|
84 |
-
r"\b(A|B|C|D)\b(?![^
|
85 |
gen,
|
86 |
)
|
87 |
|
88 |
# straight answer: A
|
89 |
if res is None:
|
90 |
-
res = re.search(r"^(A|B|C|D)(?:\.|,|:|$)", gen)
|
91 |
|
92 |
# simply extract the first appearred letter
|
93 |
if res is None:
|
94 |
-
res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
|
95 |
|
96 |
if res is None:
|
97 |
-
res = "
|
98 |
|
99 |
if isinstance(res, str):
|
100 |
return res
|
|
|
74 |
def extract_choice(gen):
|
75 |
# answer is A | choice is A | choose A
|
76 |
res = re.search(
|
77 |
+
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCDEFGHIJKL]{0,20}?(?:n't|not))[^ABCDEFGHIJKL]{0,10}?\b(?:|is|:|be))\b)[^ABCDEFGHIJKL]{0,20}?\b(A|B|C|D|E|F|G|H|I|J|K|L)\b",
|
78 |
gen,
|
79 |
)
|
80 |
|
81 |
# A is correct | A is right
|
82 |
if res is None:
|
83 |
res = re.search(
|
84 |
+
r"\b(A|B|C|D|E|F|G|H|I|J|K|L)\b(?![^ABCDEFGHIJKL]{0,8}?(?:n't|not)[^ABCDEFGHIJKL]{0,5}?(?:correct|right))[^ABCDEFGHIJKL]{0,10}?\b(?:correct|right)\b",
|
85 |
gen,
|
86 |
)
|
87 |
|
88 |
# straight answer: A
|
89 |
if res is None:
|
90 |
+
res = re.search(r"^(A|B|C|D|E|F|G|H|I|J|K|L)(?:\.|,|:|$)", gen)
|
91 |
|
92 |
# simply extract the first appearred letter
|
93 |
if res is None:
|
94 |
+
res = re.search(r"(?<![a-zA-Z])(A|B|C|D|E|F|G|H|I|J|K|L)(?![a-zA-Z=])", gen)
|
95 |
|
96 |
if res is None:
|
97 |
+
res = "L"
|
98 |
|
99 |
if isinstance(res, str):
|
100 |
return res
|