pminervini committed on
Commit
bcdca08
1 Parent(s): f41876f
Files changed (2) hide show
  1. cli/halueval-cli.py +1 -1
  2. src/backend/envs.py +6 -3
cli/halueval-cli.py CHANGED
@@ -42,7 +42,7 @@ def main():
42
  for task in TASKS_HARNESS:
43
  print(f"Selected Tasks: [{task}]")
44
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=1,
45
- batch_size=1, device="mps", use_cache=None, limit=1, write_out=True)
46
  print('AAA', results["results"])
47
 
48
  breakpoint()
 
42
  for task in TASKS_HARNESS:
43
  print(f"Selected Tasks: [{task}]")
44
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=1,
45
+ batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
46
  print('AAA', results["results"])
47
 
48
  breakpoint()
src/backend/envs.py CHANGED
@@ -22,9 +22,12 @@ class Tasks(Enum):
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
- task2 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
26
- task3 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0) # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
27
- task4 = Task("halueval_qa", "acc", "HaluEval QA", 0)
 
 
 
28
 
29
  # NUM_FEWSHOT = 64 # Change with your few shot
30
 
 
22
  # task1 = Task("logiqa", "acc_norm", "LogiQA")
23
  task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
24
  task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
25
+ # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
26
+ # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
27
+ task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
28
+ task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
29
+ task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
30
+ # task6 = Task("xsum", "rougeL_acc", "XSum", 8)
31
 
32
  # NUM_FEWSHOT = 64 # Change with your few shot
33