pminervini committed on
Commit 669da77
1 Parent(s): e6299b2
app.py CHANGED
@@ -36,18 +36,16 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

+
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
+    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 except Exception:
     restart_space()
+
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
+    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 except Exception:
     restart_space()

@@ -58,23 +56,12 @@ leaderboard_df = original_df.copy()

 # plot_df = create_plot_df(create_scores_df(raw_data))

-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


 # Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
-):
+def update_table(hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: str, size_query: list,
+                 show_deleted: bool, query: str):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
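app.py keeps its existing fallback of restarting the Space whenever syncing the requests or results dataset fails; the commit only collapses each snapshot_download call onto a single line and compacts the update_table signature. A minimal sketch of that download-or-restart pattern, using placeholder repo and directory names and a stubbed restart callback rather than the Space's real constants:

from huggingface_hub import snapshot_download


def download_or_restart(repo_id: str, local_dir: str, on_failure) -> None:
    # Hypothetical helper mirroring the try/except in app.py: any failure to sync
    # a dataset snapshot triggers a Space restart instead of serving stale data.
    try:
        snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset",
                          tqdm_class=None, etag_timeout=30)
    except Exception:
        on_failure()


# Example with made-up values:
# download_or_restart("some-org/eval-requests", "./eval-queue", on_failure=lambda: print("restarting..."))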
backend-cli.py CHANGED
@@ -8,15 +8,16 @@ from huggingface_hub import snapshot_download
 from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
-from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND,EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
+from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
+
+from src.backend.manage_requests import EvalRequest
+from src.leaderboard.read_evals import EvalResult

 from src.envs import QUEUE_REPO, RESULTS_REPO, API

 import logging
 import pprint

-# TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-
 logging.getLogger("openai").setLevel(logging.WARNING)

 logging.basicConfig(level=logging.ERROR)
@@ -27,18 +28,102 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"

+TASKS_HARNESS = [task.value for task in Tasks]
+
 snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
+def sanity_checks():
+    print(f'Device: {DEVICE}')

     # pull the eval dataset from the hub and parse any eval requests
     # check completed evals and set them to finished
     check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
                           failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
                           hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
+    return
+
+
+def request_to_result_name(request: EvalRequest) -> str:
+    # Request: EvalRequest(model='meta-llama/Llama-2-13b-hf', private=False, status='FINISHED',
+    # json_filepath='./eval-queue-bk/meta-llama/Llama-2-13b-hf_eval_request_False_False_False.json',
+    # weight_type='Original', model_type='pretrained', precision='float32', base_model='', revision='main',
+    # submitted_time='2023-09-09T10:52:17Z', likes=389, params=13.016, license='?')
+    #
+    # EvalResult(eval_name='meta-llama_Llama-2-13b-hf_float32', full_model='meta-llama/Llama-2-13b-hf',
+    # org='meta-llama', model='Llama-2-13b-hf', revision='main',
+    # results={'nq_open': 33.739612188365655, 'triviaqa': 74.12505572893447},
+    # precision=<Precision.float32: ModelDetails(name='float32', symbol='')>,
+    # model_type=<ModelType.PT: ModelDetails(name='pretrained', symbol='🟢')>,
+    # weight_type=<WeightType.Original: ModelDetails(name='Original', symbol='')>,
+    # architecture='LlamaForCausalLM', license='?', likes=389, num_params=13.016, date='2023-09-09T10:52:17Z', still_on_hub=True)
+    #
+    org_and_model = request.model.split("/", 1)
+    if len(org_and_model) == 1:
+        model = org_and_model[0]
+        res = f"{model}_{request.precision}"
+    else:
+        org = org_and_model[0]
+        model = org_and_model[1]
+        res = f"{org}_{model}_{request.precision}"
+    return res
+
+
+def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
+    results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
+                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+
+    dumped = json.dumps(results, indent=2)
+    print(dumped)
+
+    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w") as f:
+        f.write(dumped)
+
+    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
+                    repo_id=RESULTS_REPO, repo_type="dataset")
+    return results
+
+
+def process_finished_requests() -> bool:
+    sanity_checks()
+
+    current_finished_status = [FINISHED_STATUS]
+
+    # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
+    eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+    # Sort the evals by priority (first submitted first run)
+    eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
+
+    from src.leaderboard.read_evals import get_raw_eval_results
+    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+
+    result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
+    result_name_to_result = {r.eval_name: r for r in eval_results}
+
+    for eval_request in eval_requests:
+        result_name: str = request_to_result_name(eval_request)
+
+        # Check the corresponding result
+        eval_result: EvalResult = result_name_to_result[result_name]
+
+        # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
+        for task in TASKS_HARNESS:
+            task_name = task.benchmark
+
+            if task_name not in eval_result.results:
+                results = process_evaluation(task, eval_request)
+                return True
+
+    return False
+
+
+def process_pending_requests() -> bool:
+    sanity_checks()
+
+    current_pending_status = [PENDING_STATUS]

     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
@@ -48,7 +133,7 @@ def run_auto_eval():
     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

     if len(eval_requests) == 0:
-        return
+        return False

     eval_request = eval_requests[0]
     pp.pprint(eval_request)
@@ -56,33 +141,17 @@ def run_auto_eval():
     set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)

-    # results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
-    #                          batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
-
-    TASKS_HARNESS = [task.value for task in Tasks]
-
-    print(f'Device: {DEVICE}')
-
     for task in TASKS_HARNESS:
-        results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
-                                 batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
-
-        dumped = json.dumps(results, indent=2)
-        print(dumped)
-
-        output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        with open(output_path, "w") as f:
-            f.write(dumped)
-
-        API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-                        repo_id=RESULTS_REPO, repo_type="dataset")
+        results = process_evaluation(task, eval_request)

     set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                      local_dir=EVAL_REQUESTS_PATH_BACKEND)

-    # breakpoint()
+    return True


 if __name__ == "__main__":
-    run_auto_eval()
+    res = process_pending_requests()
+
+    if res is False:
+        res = process_finished_requests()
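The backend-cli.py change splits the old run_auto_eval into process_pending_requests (evaluate a newly submitted model) and process_finished_requests (backfill tasks that an already finished model is still missing). The two sides are matched through a shared key: request_to_result_name turns an EvalRequest into the "{org}_{model}_{precision}" string that EvalResult.eval_name uses. A self-contained sketch of that keying logic, with SimpleRequest as a simplified stand-in for the repo's EvalRequest:

from dataclasses import dataclass


@dataclass
class SimpleRequest:
    # Simplified stand-in for src.backend.manage_requests.EvalRequest.
    model: str
    precision: str


def request_to_result_name(request: SimpleRequest) -> str:
    # "org/model" -> "org_model_precision"; a bare model name keeps "model_precision".
    org_and_model = request.model.split("/", 1)
    if len(org_and_model) == 1:
        return f"{org_and_model[0]}_{request.precision}"
    return f"{org_and_model[0]}_{org_and_model[1]}_{request.precision}"


assert request_to_result_name(SimpleRequest("meta-llama/Llama-2-13b-hf", "float32")) == "meta-llama_Llama-2-13b-hf_float32"
assert request_to_result_name(SimpleRequest("gpt2", "float16")) == "gpt2_float16"

Note that process_finished_requests returns True right after the first missing task it evaluates, so each run of the script does at most one unit of work: a pending request if there is one, otherwise a single missing (task, model) pair.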
src/backend/manage_requests.py CHANGED
@@ -112,3 +112,4 @@ def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_stat
         else:
             print(f"No result file found for {model} setting it to {failed_status}")
             set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
+
src/backend/run_eval_suite.py CHANGED
@@ -6,7 +6,7 @@ import logging
 logging.getLogger("openai").setLevel(logging.WARNING)


-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None):
+def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None) -> dict:
     if limit:
         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

@@ -14,18 +14,11 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz

     print(f"Selected Tasks: {task_names}")

-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
+    results = evaluator.simple_evaluate(model="hf-causal-experimental", # "hf-causal"
+                                        model_args=eval_request.get_model_args(),
+                                        tasks=task_names, num_fewshot=num_fewshot,
+                                        batch_size=batch_size, device=device, no_cache=no_cache,
+                                        limit=limit, write_out=True, output_base_path="logs")

     results["config"]["model_dtype"] = eval_request.precision
     results["config"]["model_name"] = eval_request.model
src/leaderboard/read_evals.py CHANGED
@@ -31,8 +31,8 @@ class EvalResult:
     date: str = "" # submission date of request file
     still_on_hub: bool = False

-    @classmethod
-    def init_from_json_file(self, json_filepath):
+    @staticmethod
+    def init_from_json_file(json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -93,7 +93,7 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

-        print(json_filepath, results)
+        # print(json_filepath, results)

         # XXX
         # if 'nq_open' not in results:
@@ -103,9 +103,9 @@ class EvalResult:
         # if 'triviaqa' not in results:
         #     results['triviaqa'] = 0.0

-        return self(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
-                    precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
-                    architecture=architecture)
+        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
+                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
+                          architecture=architecture)

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
@@ -210,7 +210,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
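In read_evals.py, init_from_json_file was declared a @classmethod but named its first parameter self and constructed instances via self(...); since the first argument of a classmethod is actually the class, this worked, but it read like an instance method. The commit switches to a @staticmethod that names EvalResult explicitly (at the cost of subclass-aware construction). A minimal illustration of the new style, using a simplified Result stand-in rather than the real EvalResult:

import json
from dataclasses import dataclass, field


@dataclass
class Result:
    # Simplified stand-in for EvalResult, only to illustrate the constructor pattern.
    eval_name: str
    results: dict = field(default_factory=dict)

    @staticmethod
    def init_from_json_file(json_filepath: str) -> "Result":
        # Post-change style: a staticmethod that names the class explicitly,
        # instead of a classmethod whose first parameter was called `self`.
        with open(json_filepath) as fp:
            data = json.load(fp)
        return Result(eval_name=data["eval_name"], results=data.get("results", {}))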