pminervini committed on
Commit
bcdca08
1 Parent(s): f41876f
Files changed (2) hide show
  1. cli/halueval-cli.py +1 -1
  2. src/backend/envs.py +6 -3
cli/halueval-cli.py CHANGED
@@ -42,7 +42,7 @@ def main():
42
  for task in TASKS_HARNESS:
43
  print(f"Selected Tasks: [{task}]")
44
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=1,
45
- batch_size=1, device="mps", use_cache=None, limit=1, write_out=True)
46
  print('AAA', results["results"])
47
 
48
  breakpoint()
 
42
  for task in TASKS_HARNESS:
43
  print(f"Selected Tasks: [{task}]")
44
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=1,
45
+ batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
46
  print('AAA', results["results"])
47
 
48
  breakpoint()
src/backend/envs.py CHANGED
@@ -22,9 +22,12 @@ class Tasks(Enum):
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
- task2 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
26
- task3 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0) # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
27
- task4 = Task("halueval_qa", "acc", "HaluEval QA", 0)
 
 
 
28
 
29
  # NUM_FEWSHOT = 64 # Change with your few shot
30
 
 
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
+ # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
26
+ # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
27
+ task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
28
+ task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
29
+ task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
30
+ # task6 = Task("xsum", "rougeL_acc", "XSum", 8)
31
 
32
  # NUM_FEWSHOT = 64 # Change with your few shot
33