pminervini committed
Commit
73d1e6e
1 Parent(s): b2aa5d0
cli/fix-requests-cli.py CHANGED
@@ -41,7 +41,7 @@ for path in json_files:
         data["model_type"] = "fine-tuned"
         to_overwrite = True
 
-    is_instruction_tuned = 'nstruct' in model_id
+    is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
     if is_instruction_tuned:
         data["model_type"] = "instruction-tuned"
         to_overwrite = True
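The 'nstruct' substring deliberately matches both "Instruct" and "instruct" in a model id; the new clause additionally tags chat-style checkpoints. A minimal sketch of the widened check (the model ids below are illustrative, not taken from the queue):

# Illustrative model ids only; the real ones come from the request JSON files.
for model_id in ["mistralai/Mistral-7B-Instruct-v0.2",
                 "meta-llama/Llama-2-7b-chat-hf",
                 "EleutherAI/pythia-2.8b"]:
    is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
    print(model_id, "->", "instruction-tuned" if is_instruction_tuned else "fine-tuned")

Note that the 'chat' test is case-sensitive as written, so an id containing only "Chat" would still be classified as fine-tuned.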
cli/halueval-cli.py CHANGED
@@ -7,6 +7,8 @@ from src.backend.manage_requests import get_eval_requests
 from src.backend.manage_requests import EvalRequest
 from src.backend.run_eval_suite import run_evaluation
 
+from src.backend.tasks.xsum.task import XSum
+
 from lm_eval.tasks import initialize_tasks, include_task_folder
 from lm_eval import tasks, evaluator, utils
 
@@ -15,7 +17,7 @@ from src.envs import QUEUE_REPO
 
 
 def main():
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+    # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
     PENDING_STATUS = "PENDING"
     RUNNING_STATUS = "RUNNING"
@@ -28,7 +30,10 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-    TASKS_HARNESS = [t.value for t in Tasks if 'xsum' in t.value.benchmark]
+    # my_task = Task("memo-trap", "acc", "memo-trap", 0)
+    my_task = Task("xsum", "rougeLsum", "XSum", 2)
+
+    TASKS_HARNESS = [my_task]
     # task_names = ['triviaqa']
     # TASKS_HARNESS = [task.value for task in Tasks]
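The hand-built my_task relies on the four-field Task structure used in src/backend/envs.py; a minimal self-contained sketch of that shape (field names inferred from the call sites in this commit, the real definition lives elsewhere in the repo):

from dataclasses import dataclass

# Assumed field layout, inferred from Task("xsum", "rougeLsum", "XSum", 2)
# here and t.value.benchmark in src/backend/envs.py.
@dataclass(frozen=True)
class Task:
    benchmark: str    # lm-eval-harness task name
    metric: str       # metric key to report
    col_name: str     # display column on the leaderboard
    num_fewshot: int  # few-shot examples per prompt

my_task = Task("xsum", "rougeLsum", "XSum", 2)
TASKS_HARNESS = [my_task]
task_names = [t.benchmark for t in TASKS_HARNESS]  # ['xsum']

Pinning the task by hand like this avoids depending on whichever entries happen to be enabled in the Tasks enum at the time.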
src/backend/tasks/xsum/xsum.yaml.bak → snippets/xsum.yaml RENAMED
File without changes
src/backend/envs.py CHANGED
@@ -23,12 +23,18 @@ class Tasks(Enum):
     task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
     task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
     # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
+
     # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
     task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
     task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
+
     task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
+    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
+    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
+
     # task6 = Task("xsum", "rougeL_acc", "XSum", 8)
-    task6 = Task("memo-trap", "acc", "memo-trap", 0)
+
+    task8 = Task("memo-trap", "acc", "memo-trap", 0)
 
     # NUM_FEWSHOT = 64 # Change with your few shot
 
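One non-obvious detail: Enum member names must be unique, so with task6 and task7 now reserved by the commented-out HaluEval variants, memo-trap moves to task8. Consumers iterate the enum and read .value, so the member name itself is arbitrary; a minimal sketch, reusing the assumed Task shape from above:

from enum import Enum
from dataclasses import dataclass

@dataclass(frozen=True)
class Task:  # assumed shape, as sketched earlier
    benchmark: str
    metric: str
    col_name: str
    num_fewshot: int

class Tasks(Enum):
    task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    task8 = Task("memo-trap", "acc", "memo-trap", 0)

# Downstream code never touches task5/task8 by name:
TASKS_HARNESS = [t.value for t in Tasks]
print([t.benchmark for t in TASKS_HARNESS])  # ['halueval_qa', 'memo-trap']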