pminervini committed on
Commit
3d44a49
1 Parent(s): 21eac98
cli/eval-cli.py CHANGED
@@ -35,7 +35,8 @@ def main():
     # my_task = Task("memo-trap", "acc", "memo-trap", 0)
     # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
     # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
-    my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
+    # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
+    my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
 
     eval_logger = utils.eval_logger
     import logging
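This change swaps the hard-coded debugging task in eval-cli.py from the 5-shot TrueFalse probe to the new 2-shot FaithDial hallucination-detection task. The Task definition itself is not part of this diff, so the sketch below is only an assumption about what the four positional arguments mean (harness task name, reported metric, display label, number of few-shot examples); the field names are illustrative, not the repository's actual ones.

from dataclasses import dataclass

@dataclass
class Task:  # hypothetical stand-in; the real class lives elsewhere in the repo
    benchmark: str     # task name registered in the eval harness, e.g. "faithdial_hallu"
    metric: str        # metric to report, e.g. "acc"
    col_name: str      # human-readable label, e.g. "FaithDIAL"
    num_fewshot: int   # few-shot examples prepended to each prompt, e.g. 2

my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)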
src/backend/tasks/faithdial/faithdial.yaml ADDED
@@ -0,0 +1,16 @@
+group: faithdial
+task: faithdial_hallu
+dataset_path: McGill-NLP/FaithDial
+training_split: train
+validation_split: validation
+test_split: test
+output_type: multiple_choice
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+# process_results: !function utils.process_results
+doc_to_choice: ["false", "true"]
+metric_list:
+  - metric: acc
+    higher_is_better: True
+metadata:
+  version: 0.0
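The new YAML registers FaithDial hallucination detection as a multiple-choice task in the lm-evaluation-harness config format: prompts are built by utils.doc_to_text, the gold label comes from utils.doc_to_target, the model scores the two choices "false"/"true", and accuracy is reported. As a rough sketch (assuming the Hugging Face datasets library is available), the records behind dataset_path expose the fields the prompt functions read:

from datasets import load_dataset

# Inspect one validation record of McGill-NLP/FaithDial.
ds = load_dataset("McGill-NLP/FaithDial", split="validation")
example = ds[0]

print(example["knowledge"])   # grounding passage the response should stay faithful to
print(example["history"])     # list of previous dialogue turns
print(example["response"])    # response being judged
print(example["BEGIN"])       # annotation tags; "Hallucination" marks an unfaithful response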
src/backend/tasks/faithdial/utils.py ADDED
@@ -0,0 +1,20 @@
+from typing import List, Union
+ValueType = Union[str, List[str]]
+
+
+def doc_to_text(doc: dict[str, ValueType]) -> str:
+    history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
+    doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
+    # breakpoint()
+    return doc_text
+
+
+def doc_to_target(doc: dict[str, ValueType]) -> str:
+    res = "true" if "Hallucination" in doc["BEGIN"] else "false"
+    # breakpoint()
+    return res
+
+
+def process_results(doc: dict[str, ValueType], results: List[str]) -> dict[str, float]:
+    # breakpoint()
+    return {"acc": 0.0}
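A minimal usage sketch for the two helpers above, with a hand-built record (the field values are made up; the field names are exactly those the functions read):

doc = {
    "knowledge": "The Eiffel Tower is 330 metres tall.",
    "history": ["How tall is the Eiffel Tower?"],
    "response": "It is about 500 metres tall.",
    "BEGIN": ["Hallucination"],
}

print(doc_to_text(doc))
# #Knowledge#: The Eiffel Tower is 330 metres tall.
# #Dialogue History#: [Human] How tall is the Eiffel Tower?
# #Response#: It is about 500 metres tall.
# #Hallucinated#:

print(doc_to_target(doc))  # -> "true", since "Hallucination" is in doc["BEGIN"]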
src/backend/tasks/halueval/utils.py CHANGED
@@ -83,13 +83,13 @@ You should try your best to determine if the summary contains non-factual or hal
 
 def doc_to_text_qa(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
-    doc_text = QA_INSTURCTIONS + "\n\n#Knowledge: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
+    doc_text = QA_INSTURCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
     return doc_text
 
 
 def doc_to_text_dialogue(doc: dict[str, str]) -> str:
     # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
-    doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
+    doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
     return doc_text
 
 
@@ -127,7 +127,7 @@ def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
     return res
 
 
-def process_results(doc: dict[str, str], results: list[str]):
+def process_results(doc: dict[str, str], results: list[str]) -> dict[str, float]:
     # results is e.g., ['Yes']
    gold_list = doc_to_target(doc)
     # gold_list is e.g., 'yes'
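Both edits in halueval/utils.py are consistency fixes: the knowledge delimiter in the QA and dialogue prompts now reads "#Knowledge#:" like the other field markers, and process_results gains the dict[str, float] return annotation it already satisfies. A small sketch of the corrected QA prompt tail (QA_INSTURCTIONS is defined earlier in that file and omitted here; the record is a made-up example):

doc = {
    "knowledge": "Paris is the capital of France.",
    "question": "What is the capital of France?",
    "answer": "Lyon",
}
tail = (
    "\n\n#Knowledge#: " + doc["knowledge"]
    + "\n#Question#: " + doc["question"]
    + "\n#Answer#: " + doc["answer"]
    + "\n#Your Judgement#:"
)
print(tail)  # every field marker now ends with '#'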