Commit b79c971 (parent: 950ba14), committed by pminervini
halueval-cli.py CHANGED
@@ -28,20 +28,22 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-    task_names = ['halueval_qa']
+    # task_names = ['halueval_qa']
+    # task_names = ['triviaqa']
+    TASKS_HARNESS = [task.value for task in Tasks]
 
     include_task_folder("src/backend/tasks/")
     initialize_tasks('INFO')
 
     print(tasks.ALL_TASKS)
 
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
-
-    for task_name in task_names:
-        print(f"Selected Tasks: [{task_name}]")
-        results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=[task_name], num_fewshot=0,
+    for task in TASKS_HARNESS:
+        print(f"Selected Tasks: [{task}]")
+        results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=0,
                                             batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
-        print('AAA', results)
+        print('AAA', results["results"])
+
+    # breakpoint()
 
 
 if __name__ == "__main__":
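Note: the loop above iterates over a Tasks enum that is not part of this diff; it only assumes that each task.value exposes a benchmark name (passed to simple_evaluate) and, in read_evals.py below, a metric key. A minimal sketch of a compatible definition, with illustrative entries and field values that are assumptions, not taken from this commit:

# Minimal sketch of a Tasks enum compatible with the loop above; the real
# definition lives elsewhere in the repository, and the entries below are
# illustrative assumptions.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # lm-eval-harness task name, passed as tasks=[task.benchmark]
    metric: str     # metric key read back from the results (e.g. "em")


class Tasks(Enum):
    halueval_qa = Task("halueval_qa", "em")
    halueval_dialogue = Task("halueval_dialogue", "em")
    halueval_summarization = Task("halueval_summarization", "em")


# As in the CLI above: unwrap the enum members into plain Task records.
TASKS_HARNESS = [task.value for task in Tasks]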
src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED
@@ -4,23 +4,11 @@ dataset_name: dialogue_samples
 output_type: generate_until
 training_split: data
 validation_split: data
+test_split: data
+num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_dialogue
 doc_to_target: !function utils.doc_to_target_qa
 process_results: !function utils.process_results_qa
-fewshot_delimiter: "\n"
-generation_kwargs:
-  until:
-    - "\n"
-    - "."
-    - ","
-  do_sample: false
-  temperature: 0.0
-filter_list:
-  - name: remove_whitespace
-    filter:
-      - function: remove_whitespace
-      - function: take_first
-target_delimiter: " "
 metric_list:
   - metric: em
     aggregation: mean
src/backend/tasks/halueval/halueval_qa.yaml CHANGED
@@ -4,23 +4,11 @@ dataset_name: qa_samples
 output_type: generate_until
 training_split: data
 validation_split: data
+test_split: data
+num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_qa
 doc_to_target: !function utils.doc_to_target_qa
 process_results: !function utils.process_results_qa
-fewshot_delimiter: "\n"
-generation_kwargs:
-  until:
-    - "\n"
-    - "."
-    - ","
-  do_sample: false
-  temperature: 0.0
-filter_list:
-  - name: remove_whitespace
-    filter:
-      - function: remove_whitespace
-      - function: take_first
-target_delimiter: " "
 metric_list:
   - metric: em
     aggregation: mean
src/backend/tasks/halueval/halueval_summarization.yaml CHANGED
@@ -4,23 +4,11 @@ dataset_name: summarization_samples
 output_type: generate_until
 training_split: data
 validation_split: data
+test_split: data
+num_fewshot: 0
 doc_to_text: !function utils.doc_to_text_summarization
 doc_to_target: !function utils.doc_to_target_qa
 process_results: !function utils.process_results_qa
-fewshot_delimiter: "\n"
-generation_kwargs:
-  until:
-    - "\n"
-    - "."
-    - ","
-  do_sample: false
-  temperature: 0.0
-filter_list:
-  - name: remove_whitespace
-    filter:
-      - function: remove_whitespace
-      - function: take_first
-target_delimiter: " "
 metric_list:
   - metric: em
     aggregation: mean
src/backend/tasks/halueval/utils.py CHANGED
@@ -116,7 +116,7 @@ def compute_metrics_qa(gold_answer: str, prediction: str) -> dict[str, float]:
     elif "No" in prediction:
         prediction = "no"
 
-    is_exact = (gold_answer == prediction)
+    is_exact = gold_answer == prediction
 
     res = {"correctness": 1.0 if is_correct else 0.0}
     if is_correct:
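The em metric declared in the YAML tasks above is produced by utils.process_results_qa, whose body is not touched by this diff. In lm-eval-harness, a process_results hook referenced via !function receives the document and the list of generated strings and returns a dict of per-document metric values. A hypothetical sketch of such a hook, assuming the HaluEval samples store the gold label under a right_answer field (the field name and the exact metric wiring are assumptions):

# Hypothetical sketch of a process_results hook compatible with the YAML tasks above.
# The real utils.process_results_qa is not shown in this commit; the document field
# name ("right_answer") and the way compute_metrics_qa feeds "em" are assumptions.
def process_results_qa(doc: dict, results: list[str]) -> dict[str, float]:
    prediction = results[0]            # generate_until tasks yield one completion per doc
    gold_answer = doc["right_answer"]  # assumed field name in the HaluEval samples
    metrics = compute_metrics_qa(gold_answer, prediction)
    # The returned key must match the metric declared in metric_list ("em") so that
    # the mean aggregation configured in the YAML files can pick it up.
    return {"em": metrics["em"]}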
src/leaderboard/read_evals.py CHANGED
@@ -86,7 +86,17 @@ class EvalResult:
                 continue
 
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+
+            def post_process_results(results: dict) -> dict:
+                res_copy = results.copy()
+                for k, v in res_copy.items():
+                    if "," in k:
+                        tokens = k.split(",")
+                        results[tokens[0]] = v
+                return results
+
+            accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
+
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
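For reference, the nested post_process_results helper above only normalizes the top-level keys of the results dictionary: any key containing a comma is re-registered, in place, under its prefix before the first comma, so the task.benchmark in k filter and the subsequent metric lookup still match. A small standalone demonstration with made-up keys:

# Standalone demonstration of the key normalization done by post_process_results
# in the diff above; the example keys and values are illustrative, not real results.
def post_process_results(results: dict) -> dict:
    res_copy = results.copy()          # copy so the dict is not mutated while iterating
    for k, v in res_copy.items():
        if "," in k:
            tokens = k.split(",")
            results[tokens[0]] = v     # also expose the value under the plain prefix
    return results


results = {
    "halueval_qa,custom-suffix": {"em": 0.5},  # hypothetical suffixed key
    "halueval_dialogue": {"em": 0.25},         # plain task name, left untouched
}
normalized = post_process_results(results)

assert normalized["halueval_qa"] == {"em": 0.5}        # now reachable by plain prefix
assert "halueval_qa,custom-suffix" in normalized       # original key is kept as well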