pminervini committed on
Commit
40ac231
1 Parent(s): 4e10b3e
fix-requests-cli.py CHANGED
@@ -41,7 +41,7 @@ for path in json_files:
         data["model_type"] = "fine-tuned"
         to_overwrite = True
 
-    is_instruction_tuned = 'instruct' in model_id
+    is_instruction_tuned = 'nstruct' in model_id
     if is_instruction_tuned:
         data["model_type"] = "instruction-tuned"
         to_overwrite = True
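Note: shortening the needle from 'instruct' to 'nstruct' makes the check catch both the capitalised and lower-case spellings that appear in model ids. A minimal standalone sketch of the behaviour (the model ids below are only illustrative):

    # Why 'nstruct' instead of 'instruct': the shorter needle matches
    # "Instruct" as well as "instruct" without lower-casing the id.
    for model_id in ["org/Mistral-7B-Instruct", "org/llama-2-instruct", "org/llama-2-base"]:
        is_instruction_tuned = 'nstruct' in model_id
        print(model_id, "->", "instruction-tuned" if is_instruction_tuned else "not instruction-tuned")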
src/display/utils.py CHANGED
@@ -27,6 +27,7 @@ class Tasks(Enum):
     triviaqa = Task("triviaqa", "em", "TriviaQA")
     truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
     truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
+    halueval_qa = Task("halueval_qa", "em", "HaluEval QA")
     #truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1")
     #truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2")
 
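For context, downstream code reads each entry through task.value and uses its benchmark and metric fields (see the task.benchmark / task.metric accesses in src/leaderboard/read_evals.py below). A minimal sketch, assuming it is run from the repository root:

    # List the benchmarks and metrics the leaderboard aggregates,
    # including the new halueval_qa entry.
    from src.display.utils import Tasks

    for task in Tasks:
        task = task.value
        print(task.benchmark, task.metric)  # e.g. "halueval_qa" "em"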
 
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -32,7 +31,7 @@ class EvalResult:
     still_on_hub: bool = False
 
     @staticmethod
-    def init_from_json_file(json_filepath):
+    def init_from_json_file(json_filepath, is_backend: bool = False):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -67,79 +66,13 @@ class EvalResult:
 
         # Extract results available in this file (some results are split in several files)
         results = {}
-        for task in Tasks:
-            task = task.value
-
-            def post_process_results(results: dict) -> dict:
-                # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
-                res_copy = results.copy()
-
-                for task_name in res_copy.keys():
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "exact_match" in k:
-                            results[task_name][k.replace("exact_match", "em")] = v
-
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "," in k:
-                            tokens = k.split(",")
-                            results[task_name][tokens[0]] = v
-
-                return results
-
-            accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
-
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
-                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
-                          architecture=architecture)
-
-    @staticmethod
-    def init_from_json_file_backend(json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        # We manage the legacy config format
-        config = data.get("config", data.get("config_general", None))
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, error, model_config = \
-            is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        from src.backend.envs import Tasks as BackendTasks
-        for task in BackendTasks:
+
+        task_iterator = Tasks
+        if is_backend is True:
+            from src.backend.envs import Tasks as BackendTasks
+            task_iterator = BackendTasks
+
+        for task in task_iterator:
             task = task.value
 
             def post_process_results(results: dict) -> dict:
@@ -267,10 +200,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        if is_backend:
-            eval_result = EvalResult.init_from_json_file_backend(model_result_filepath)
-        else:
-            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
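The hunks above fold the former init_from_json_file_backend duplicate into init_from_json_file behind an is_backend flag, so callers pick the task enum with a single argument. A minimal usage sketch (the JSON path is a placeholder):

    # One parser for both the leaderboard frontend and the backend:
    # only the Tasks enum iterated over differs.
    from src.leaderboard.read_evals import EvalResult

    ui_result = EvalResult.init_from_json_file("results/some-model.json")
    backend_result = EvalResult.init_from_json_file("results/some-model.json", is_backend=True)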
submit-cli.py CHANGED
@@ -2,12 +2,18 @@
 
 import json
 import os
+import time
 
 from datetime import datetime, timezone
 
 from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
 from src.submission.check_validity import already_submitted_models, get_model_size, is_model_on_hub
 
+from huggingface_hub import snapshot_download
+from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
+from src.backend.manage_requests import get_eval_requests
+from src.backend.manage_requests import EvalRequest
+
 
 def add_new_eval(model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str):
     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
@@ -118,32 +124,40 @@ def main():
 
     filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
 
-    for i in range(min(200, len(filtered_model_lst))):
-        model = filtered_model_lst[i]
-
-        print(f'Considering {model.id} ..')
-
-        from huggingface_hub import snapshot_download
-        from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
-        from src.backend.manage_requests import get_eval_requests
-        from src.backend.manage_requests import EvalRequest
-
-        snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
-
-        PENDING_STATUS = "PENDING"
-        RUNNING_STATUS = "RUNNING"
-        FINISHED_STATUS = "FINISHED"
-        FAILED_STATUS = "FAILED"
-
-        status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
-
-        # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
-        eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-
-        requested_model_names = {e.model for e in eval_requests}
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+    PENDING_STATUS = "PENDING"
+    RUNNING_STATUS = "RUNNING"
+    FINISHED_STATUS = "FINISHED"
+    FAILED_STATUS = "FAILED"
+
+    status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
+
+    # Get all eval requests
+    eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+
+    requested_model_names = {e.model for e in eval_requests}
+
+    for i in range(min(200, len(filtered_model_lst))):
+        model = filtered_model_lst[i]
+
+        print(f'Considering {model.id} ..')
+
+        is_finetuned = any(tag.startswith('base_model:') for tag in model.tags)
+
+        model_type = 'pretrained'
+        if is_finetuned:
+            model_type = "fine-tuned"
+
+        is_instruction_tuned = 'nstruct' in model.id
+        if is_instruction_tuned:
+            model_type = "instruction-tuned"
 
         if model.id not in requested_model_names:
-            add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type='pretrained')
+
+            if 'mage' not in model.id:
+                add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
+                time.sleep(60)
         else:
            print(f'Model {model.id} already added, not adding it to the queue again.')
 
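In main(), the queue snapshot and the get_eval_requests call now run once before the loop instead of once per candidate model, each submission is followed by time.sleep(60) to space out Hub writes, and the submitted model_type is inferred from the model's tags and id. A standalone sketch of that inference (infer_model_type is a hypothetical helper written here only to illustrate the checks in the hunk above):

    def infer_model_type(model_id: str, tags: list[str]) -> str:
        # Mirrors the diff: a base_model tag marks a fine-tune, and an id
        # containing "nstruct" marks an instruction-tuned model.
        model_type = 'pretrained'
        if any(tag.startswith('base_model:') for tag in tags):
            model_type = "fine-tuned"
        if 'nstruct' in model_id:
            model_type = "instruction-tuned"
        return model_type

    print(infer_model_type("org/some-model-Instruct", ["base_model:org/some-model"]))  # instruction-tuned
    print(infer_model_type("org/some-model", []))                                      # pretrained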