pminervini committed on
Commit
894c4b4
1 Parent(s): e504efd
backend-cli.py ADDED
@@ -0,0 +1,80 @@
+ import os
+ import json
+
+ from datetime import datetime
+
+ from huggingface_hub import snapshot_download
+
+ from src.backend.run_eval_suite import run_evaluation
+ from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
+ from src.backend.sort_queue import sort_models_by_priority
+ from src.backend.envs import Tasks, NUM_FEWSHOT, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
+
+ from src.envs import QUEUE_REPO, RESULTS_REPO, API
+
+ import logging
+ import pprint
+
+ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
+
+ logging.getLogger("openai").setLevel(logging.WARNING)
+
+ logging.basicConfig(level=logging.ERROR)
+ pp = pprint.PrettyPrinter(width=80)
+
+ PENDING_STATUS = "PENDING"
+ RUNNING_STATUS = "RUNNING"
+ FINISHED_STATUS = "FINISHED"
+ FAILED_STATUS = "FAILED"
+
+ snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+
+
+ def run_auto_eval():
+     current_pending_status = [PENDING_STATUS]
+
+     # Check whether the currently RUNNING evals have produced results and mark them FINISHED or FAILED
+     check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
+                           failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
+                           hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
+
+     # Pull the eval requests from the hub and keep the PENDING ones; change this parameter to run other evals
+     eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
+     # Sort the evals by priority (first submitted, first run)
+     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+
+     print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
+
+     if len(eval_requests) == 0:
+         return
+
+     eval_request = eval_requests[0]
+     pp.pprint(eval_request)
+
+     set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
+                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
+
+     results = run_evaluation(eval_request=eval_request, task_names=TASKS_HARNESS, num_fewshot=NUM_FEWSHOT,
+                              batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)
+
+     dumped = json.dumps(results, indent=2)
+     print(dumped)
+
+     # Use a single timestamp so the local result file and the uploaded file share the same name
+     timestamp = datetime.now().isoformat()
+     output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{timestamp}.json")
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+     with open(output_path, "w") as f:
+         f.write(dumped)
+
+     API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{timestamp}.json",
+                     repo_id=RESULTS_REPO, repo_type="dataset")
+
+     set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
+                      local_dir=EVAL_REQUESTS_PATH_BACKEND)
+
+
+ if __name__ == "__main__":
+     run_auto_eval()
src/backend/envs.py ADDED
@@ -0,0 +1,33 @@
+ import os
+
+ import torch
+
+ from dataclasses import dataclass
+ from enum import Enum
+
+ from src.envs import CACHE_PATH
+
+
+ @dataclass
+ class Task:
+     benchmark: str
+     metric: str
+     col_name: str
+
+
+ class Tasks(Enum):
+     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+     # task0 = Task("anli_r1", "acc", "ANLI")
+     # task1 = Task("logiqa", "acc_norm", "LogiQA")
+     task0 = Task("nq_open", "em", "NQ Open")
+     task1 = Task("triviaqa", "em", "TriviaQA")
+
+
+ NUM_FEWSHOT = 64  # Adjust to the desired number of few-shot examples
+
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+
+ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ LIMIT = 32  # Per-task example limit, for testing only; set to None for full runs
src/backend/manage_requests.py ADDED
@@ -0,0 +1,126 @@
+ import glob
+ import json
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from huggingface_hub import HfApi, snapshot_download
+
+
+ @dataclass
+ class EvalRequest:
+     model: str
+     private: bool
+     status: str
+     json_filepath: str
+     weight_type: str = "Original"
+     model_type: Optional[str] = None  # pretrained, finetuned, with RL
+     precision: str = ""  # float16, bfloat16
+     base_model: Optional[str] = None  # for adapter models
+     revision: str = "main"  # commit hash
+     submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # arbitrary default so requests can still be ordered by date
+     likes: Optional[int] = 0
+     params: Optional[int] = None
+     license: Optional[str] = ""
+
+     def get_model_args(self):
+         model_args = f"pretrained={self.model},revision={self.revision}"
+
+         if self.precision in ["float16", "float32", "bfloat16"]:
+             model_args += f",dtype={self.precision}"
+         # Quantized models need some extra config and the bitsandbytes install, e.g.:
+         # elif self.precision == "8bit":
+         #     model_args += ",load_in_8bit=True"
+         # elif self.precision == "4bit":
+         #     model_args += ",load_in_4bit=True"
+         # elif self.precision == "GPTQ":
+         #     # A GPTQ model does not need dtype to be specified;
+         #     # it will be inferred from the config
+         #     pass
+         else:
+             raise Exception(f"Unknown precision {self.precision}.")
+
+         return model_args
+
+
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
+     """Updates a given eval request with its new status on the hub (running, completed, failed, ...)."""
+     json_filepath = eval_request.json_filepath
+
+     with open(json_filepath) as fp:
+         data = json.load(fp)
+
+     data["status"] = set_to_status
+
+     with open(json_filepath, "w") as f:
+         f.write(json.dumps(data))
+
+     api.upload_file(
+         path_or_fileobj=json_filepath,
+         path_in_repo=json_filepath.replace(local_dir, ""),
+         repo_id=hf_repo,
+         repo_type="dataset",
+     )
+
+
+ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
+     """Gets all evaluation requests whose status is in `job_status` from the requests repo.
+
+     Returns:
+         `list[EvalRequest]`: a list of eval request objects.
+     """
+     snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
+     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
+
+     eval_requests = []
+     for json_filepath in json_files:
+         with open(json_filepath) as fp:
+             data = json.load(fp)
+         if data["status"] in job_status:
+             data["json_filepath"] = json_filepath
+
+             # drop fields that are not part of EvalRequest
+             del data["job_id"]
+
+             eval_request = EvalRequest(**data)
+             eval_requests.append(eval_request)
+
+     return eval_requests
+
+
+ def check_completed_evals(
+     api: HfApi,
+     hf_repo: str,
+     local_dir: str,
+     checked_status: str,
+     completed_status: str,
+     failed_status: str,
+     hf_repo_results: str,
+     local_dir_results: str,
+ ):
+     """Checks whether the currently running evals are completed; if so, updates their status on the hub."""
+     snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
+
+     running_evals = get_eval_requests(job_status=[checked_status], hf_repo=hf_repo, local_dir=local_dir)
+
+     for eval_request in running_evals:
+         model = eval_request.model
+         print("====================================")
+         print(f"Checking {model}")
+
+         output_path = model
+         output_file = f"{local_dir_results}/{output_path}/results*.json"
+         output_file_exists = len(glob.glob(output_file)) > 0
+
+         if output_file_exists:
+             print(f"Result file exists for {model}; setting it to {completed_status}")
+             set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
+         else:
+             print(f"No result file found for {model}; setting it to {failed_status}")
+             set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,36 @@
+ from lm_eval import tasks, evaluator, utils
+ from src.backend.manage_requests import EvalRequest
+
+ import logging
+
+ logging.getLogger("openai").setLevel(logging.WARNING)
+
+
+ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, no_cache=True, limit=None):
+     if limit:
+         print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
+
+     task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
+
+     print(f"Selected Tasks: {task_names}")
+
+     results = evaluator.simple_evaluate(
+         model="hf-causal-experimental",  # "hf-causal"
+         model_args=eval_request.get_model_args(),
+         tasks=task_names,
+         num_fewshot=num_fewshot,
+         batch_size=batch_size,
+         device=device,
+         no_cache=no_cache,
+         limit=limit,
+         write_out=True,
+         output_base_path="logs",
+     )
+
+     results["config"]["model_dtype"] = eval_request.precision
+     results["config"]["model_name"] = eval_request.model
+     results["config"]["model_sha"] = eval_request.revision
+
+     print(evaluator.make_table(results))
+
+     return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
+ from dataclasses import dataclass
+ from huggingface_hub import HfApi
+ from src.backend.manage_requests import EvalRequest
+
+
+ @dataclass
+ class ModelMetadata:
+     likes: int = 0
+     size: int = 15
+
+
+ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
+     private_models = [model for model in models if model.private]
+     public_models = [model for model in models if not model.private]
+
+     return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
+
+
+ def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
+
+
+ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+     return sorted(eval_requests, key=lambda x: x.params, reverse=False)
+
+
+ def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
+     return sorted(eval_requests, key=lambda x: x.likes, reverse=False)