Spaces:

SUSTech
/

tlem

Running

App Files Files Community

ADD: BoolQ, TurthfulQA

by Cookize - opened Dec 14, 2023

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+93

-7

Files changed (3) hide show

tasks.py +84 -1
tlem.py +4 -1
utils.py +5 -5

tasks.py CHANGED Viewed

@@ -209,6 +209,7 @@ def multichoice_zh(responses: Any, references: list[str]):
 class Metrics:
     cmmlu = multichoice_zh
     mmlu = multichoice
     ceval = multichoice_zh
     def winogrande(responses: list[str], answers: list[str | int]):
@@ -269,6 +270,13 @@ class Metrics:
         return responses, answers
     def MATH(responses: list[str], answers: list[str]):
         extract_responses = sync_pipe(get_answer)(responses)
         extract_answers = sync_pipe(get_answer)(answers)
@@ -293,7 +301,7 @@ class CMMLU:
         for choice in list("ABCD"):
             prompt += f"\n{choice}. {example[choice]}"
-            prompt += "\n答案："
         return {"prompt": prompt}
     subcategories = {
@@ -808,6 +816,81 @@ class BBH:
         return suite
 class CEVAL:
     input_column = "input"
     label_column = "answer"

 class Metrics:
     cmmlu = multichoice_zh
     mmlu = multichoice
+    truthful_qa_mc1 = multichoice
     ceval = multichoice_zh
     def winogrande(responses: list[str], answers: list[str | int]):
         return responses, answers
+    def boolq(responses: list[str], answers: list[str | int]):
+        responses = [first_capital_postprocess(response) for response in responses]
+        answers = ["A" if answer else "B" for answer in answers]
+        return responses, answers
     def MATH(responses: list[str], answers: list[str]):
         extract_responses = sync_pipe(get_answer)(responses)
         extract_answers = sync_pipe(get_answer)(answers)
         for choice in list("ABCD"):
             prompt += f"\n{choice}. {example[choice]}"
+        prompt += "\n答案："
         return {"prompt": prompt}
     subcategories = {
         return suite
+class BoolQ:
+    input_column = "input"
+    label_column = "answer"
+    @classmethod
+    def prompt_boolq(cls, example, chat=False):
+        prompt = f"{example['passage']}\nQuestion: {example['question']}\nA. Yes\nB. No\nAnswer: "
+        return {"input": prompt}
+    @classmethod
+    def suite(cls, chat: bool):
+        suite = [
+            Task(
+                dataset_name="boolq",
+                metric_name=("sustech/tlem", "boolq"),
+                input_column=cls.input_column,
+                label_column=cls.label_column,
+                prompt=partial(cls.prompt_boolq, chat=chat),
+                few_shot=0 if chat else 5,
+                few_shot_from="train",
+                split="validation",
+            )
+        ]
+        return suite
+class TruthfulQAMC1:
+    input_column = "input"
+    label_column = "answer"
+    @classmethod
+    def prompt_truthful_qa(cls, example):
+        target = example["mc1_targets"]
+        choices = target["choices"]
+        labels = target["labels"]
+        prompt = f"The following is a multiple-choice question. Please choose the most suitable one as the answer to this question.\n\n"
+        prompt += example["question"]
+        answer = []
+        for idx, choice, label in zip(list("ABCDEFGHIJ")[:len(choices)], choices, labels):
+            prompt += f"\n{idx}. {choice}"
+            if label == 1:
+                answer = idx
+        prompt += "\nAnswer: "
+        return {
+            "input": prompt,
+            "answer": answer
+        }
+    @classmethod
+    def suite(cls):
+        suite = [
+            Task(
+                dataset_name=("truthful_qa", "multiple_choice"),
+                metric_name=("sustech/tlem", "truthful_qa_mc1"),
+                input_column=cls.input_column,
+                label_column=cls.label_column,
+                prompt=partial(cls.prompt_truthful_qa),
+                few_shot=0,
+                split="validation",
+            )
+        ]
+        return suite
 class CEVAL:
     input_column = "input"
     label_column = "answer"

tlem.py CHANGED Viewed

@@ -151,7 +151,10 @@ class Suite(EvaluationSuite):
                 suite = DROP.suite()
             case "winogrande":
                 suite = Winogrande.suite()
             case "mt_bench":
                 suite = Task(
                     dataset_name="SUSTech/mt_bench_judge",

                 suite = DROP.suite()
             case "winogrande":
                 suite = Winogrande.suite()
+            case "truthfulqa_mc1":
+                suite = TruthfulQAMC1.suite()
+            case _ if name.startswith("boolq"):
+                suite = BoolQ.suite(chat=chat)
             case "mt_bench":
                 suite = Task(
                     dataset_name="SUSTech/mt_bench_judge",

utils.py CHANGED Viewed

@@ -74,27 +74,27 @@ def extract_choice_zh(gen):
 def extract_choice(gen):
     # answer is A | choice is A | choose A
     res = re.search(
-        r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
         gen,
     )
     # A is correct | A is right
     if res is None:
         res = re.search(
-            r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
             gen,
         )
     # straight answer: A
     if res is None:
-        res = re.search(r"^(A|B|C|D)(?:\.|,|:|$)", gen)
     # simply extract the first appearred letter
     if res is None:
-        res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
     if res is None:
-        res = "A"
     if isinstance(res, str):
         return res

 def extract_choice(gen):
     # answer is A | choice is A | choose A
     res = re.search(
+        r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCDEFGHIJKL]{0,20}?(?:n't|not))[^ABCDEFGHIJKL]{0,10}?\b(?:|is|:|be))\b)[^ABCDEFGHIJKL]{0,20}?\b(A|B|C|D|E|F|G|H|I|J|K|L)\b",
         gen,
     )
     # A is correct | A is right
     if res is None:
         res = re.search(
+            r"\b(A|B|C|D|E|F|G|H|I|J|K|L)\b(?![^ABCDEFGHIJKL]{0,8}?(?:n't|not)[^ABCDEFGHIJKL]{0,5}?(?:correct|right))[^ABCDEFGHIJKL]{0,10}?\b(?:correct|right)\b",
             gen,
         )
     # straight answer: A
     if res is None:
+        res = re.search(r"^(A|B|C|D|E|F|G|H|I|J|K|L)(?:\.|,|:|$)", gen)
     # simply extract the first appearred letter
     if res is None:
+        res = re.search(r"(?<![a-zA-Z])(A|B|C|D|E|F|G|H|I|J|K|L)(?![a-zA-Z=])", gen)
     if res is None:
+        res = "L"
     if isinstance(res, str):
         return res