pminervini committed
Commit c639c51
1 Parent(s): 7644de5
backend-cli.py CHANGED
@@ -7,7 +7,7 @@ import random
 from datetime import datetime
 
 from src.backend.run_eval_suite import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+from src.backend.manage_requests import check_completed_evals, get_eval_requests
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
 
@@ -15,7 +15,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
-from src.utils import my_snapshot_download
+from src.utils import my_snapshot_download, my_set_eval_request
 
 import logging
 import pprint
@@ -136,14 +136,12 @@ def process_finished_requests(thr: int) -> bool:
         eval_request: EvalRequest = result_name_to_request[result_name]
 
         my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-        set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
-                         local_dir=EVAL_REQUESTS_PATH_BACKEND)
+        my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
         results = process_evaluation(task, eval_request)
 
         my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-        set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
-                         local_dir=EVAL_REQUESTS_PATH_BACKEND)
+        my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
         return True
 
@@ -171,8 +169,7 @@ def process_pending_requests() -> bool:
     pp.pprint(eval_request)
 
     my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-    set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
-                     local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
     task_lst = TASKS_HARNESS.copy()
     random.shuffle(task_lst)
@@ -181,8 +178,7 @@
         results = process_evaluation(task, eval_request)
 
     my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-    set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
-                     local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
     return True
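The backend-cli.py changes swap every direct set_eval_request call for my_set_eval_request, the retrying wrapper added in src/utils.py below, so a transient Hub error while moving a request between RUNNING and FINISHED no longer crashes the worker. Every call site repeats the same four-step sequence; a minimal sketch of that lifecycle as a single helper (run_with_status_updates is a hypothetical consolidation, not part of the commit):

    def run_with_status_updates(task, eval_request):
        # Re-sync the local mirror of the requests dataset, then advertise the
        # status change before and after the (potentially long) evaluation run.
        my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
        my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)

        results = process_evaluation(task, eval_request)

        my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
        my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
        return results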
 
src/backend/envs.py CHANGED
@@ -20,6 +20,9 @@ class Tasks(Enum):
     task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
     task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
 
+    task11 = Task("nq8", "em", "NQ Open 8", 8)
+    task12 = Task("tqa8", "em", "TriviaQA 8", 8)
+
     # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
     task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
     task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
@@ -38,9 +41,6 @@ class Tasks(Enum):
     task10 = Task("memo-trap", "acc", "memo-trap", 0)
     task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
 
-    task11 = Task("nq8", "em", "NQ Open 8", 8)
-    task12 = Task("tqa8", "em", "TriviaQA 8", 8)
-
     task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
 
     task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
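The nq8/tqa8 entries are only moved, not changed: they still run NQ Open and TriviaQA with 8 few-shot examples, now grouped next to their 64-shot counterparts. If, as the constructor calls suggest, Task is a record of (benchmark, metric, col_name, num_fewshot), the enum is consumed roughly like this (the field names are an assumption, not confirmed by this diff):

    from src.backend.envs import Tasks

    # List each harness benchmark with its few-shot setting, e.g. ('nq8', 8).
    for member in Tasks:
        t = member.value  # assumed fields: benchmark, metric, col_name, num_fewshot
        print(t.benchmark, t.num_fewshot)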
src/backend/tasks/nq8/nq8.yaml CHANGED
@@ -3,7 +3,7 @@ dataset_path: nq_open
 output_type: generate_until
 training_split: train
 validation_split: validation
-description: "Answer these questions:\n"
+description: "Answer these questions:\n\n"
 doc_to_text: "Q: {{question}}?\nA:"
 doc_to_target: "{{answer}}" # TODO: should be multi-target
 fewshot_delimiter: "\n"
@@ -27,6 +27,6 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
     regexes_to_ignore:
-      - "\ban|a|the\b"
+      - "\\b(?:The |the |An |A |The |a |an )"
 metadata:
   - version: 0.0
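Two fixes here. The extra \n in description separates the instruction from the first few-shot example with a blank line. The old regexes_to_ignore entry was doubly broken: inside a double-quoted YAML scalar, \b is a backspace escape rather than a regex word boundary, and regex alternation binds loosely, so even read as a raw pattern it parses as (\ban)|(a)|(the\b) and deletes every bare "a" inside words. The replacement escapes the backslash for YAML and groups the alternatives, so only a leading article plus its trailing space is stripped. (The duplicated "The |" alternative is redundant but harmless.) A quick check with Python's re, assuming the harness applies these patterns with re.sub before exact-match scoring:

    import re

    old = re.compile(r"\ban|a|the\b")  # parses as (\ban)|(a)|(the\b)
    new = re.compile("\\b(?:The |the |An |A |The |a |an )")  # as in the YAML

    print(old.sub("", "the Catalan capital"))  # ' Ctln cpitl' -- mangles words
    print(new.sub("", "the Catalan capital"))  # 'Catalan capital' -- article only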
src/utils.py CHANGED
@@ -1,4 +1,6 @@
 from huggingface_hub import snapshot_download
+from src.backend.manage_requests import set_eval_request
+import time
 
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
@@ -7,6 +9,15 @@ def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
             snapshot_download(repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers)
             return
         except Exception:
-            import time
+            time.sleep(60)
+    return
+
+
+def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
+    for i in range(10):
+        try:
+            set_eval_request(api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir)
+            return
+        except Exception:
             time.sleep(60)
     return
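Both helpers share the same retry shape: up to 10 attempts, a 60-second sleep after any exception, and a silent return if every attempt fails. That keeps the worker alive through Hub flakiness, at the cost that a failed status write surfaces no error, so a request can stay PENDING on the Hub. A usage sketch (the repo id, local path, status string, and the api/req objects are illustrative placeholders):

    from src.utils import my_snapshot_download, my_set_eval_request

    # Re-sync the local queue mirror, riding out transient Hub errors ...
    my_snapshot_download(repo_id="my-org/requests", revision="main", local_dir="./eval-queue-bk", repo_type="dataset", max_workers=60)
    # ... then record the status change; after 10 failed attempts this simply
    # returns, so the request file on the Hub may still say PENDING.
    my_set_eval_request(api=api, eval_request=req, set_to_status="RUNNING", hf_repo="my-org/requests", local_dir="./eval-queue-bk")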