Spaces:
Paused
Paused
File size: 5,299 Bytes
1ffc326 79410f6 18abd06 0f5c75a 1ffc326 08ae6c5 1ffc326 55cc480 1ffc326 0f5c75a 1ffc326 6902167 ca54606 0270220 ca54606 0e63ee0 ae8f4f4 0e63ee0 ca54606 398ca01 0f5c75a 0270220 0f5c75a 08ae6c5 1ffc326 08ae6c5 1ffc326 7135a84 08ae6c5 1ffc326 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import logging
import pprint
from huggingface_hub import snapshot_download
logging.getLogger("openai").setLevel(logging.WARNING)
from src.backend.run_eval_suite_lighteval import run_evaluation
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request, set_requests_seen
from src.backend.sort_queue import sort_models_by_priority
from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
from src.about import TASKS_LIGHTEVAL
# Configure the root logger with an ERROR threshold.
logging.basicConfig(level=logging.ERROR)

# Shared pretty-printer used for dumping requests and status messages.
pp = pprint.PrettyPrinter(width=80)

# Status labels passed to the request-management helpers.
PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

# Download the results dataset and the request-queue dataset into their local
# backend directories. This runs at import time, before any evaluation starts.
snapshot_download(
    repo_id=RESULTS_REPO,
    revision="main",
    local_dir=EVAL_RESULTS_PATH_BACKEND,
    repo_type="dataset",
    max_workers=60,
    token=TOKEN,
)
snapshot_download(
    repo_id=QUEUE_REPO,
    revision="main",
    local_dir=EVAL_REQUESTS_PATH_BACKEND,
    repo_type="dataset",
    max_workers=60,
    token=TOKEN,
)
# Maximum number of models a single commit author may have evaluated per month.
_MAX_MODELS_PER_USER = 4


def _mark_request_failed(eval_request):
    """Call ``set_eval_request`` to move *eval_request* to ``FAILED_STATUS``."""
    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )


def _select_instance(params):
    """Map a model's ``params`` value to ``(instance_size, instance_type, cap)``.

    Returns ``None`` when the model is too large for any configured tier.
    ``cap`` is the monthly limit on requests for that instance configuration.
    """
    # NOTE(review): ``params`` is presumably in billions of parameters - confirm.
    # A separate tier for params < 4 ("x1", "nvidia-a10g", cap 20) is disabled.
    if params < 9:
        return "x1", "nvidia-a10g", 35
    if params < 24:
        return "x4", "nvidia-a10g", 15
    return None


def run_auto_eval():
    """Pick the highest-priority pending eval request and run it.

    Steps, in order:

    1. Run ``check_completed_evals`` over requests in ``RUNNING_STATUS``.
    2. Fetch ``PENDING_STATUS`` requests and sort them by priority; return
       immediately when there are none.
    3. For the first request only: choose an instance tier from its
       ``params``, then apply the per-instance-type cap and the per-author
       cap. A request that is too large or exceeds a cap is set to
       ``FAILED_STATUS`` and the function returns.
    4. Record the request in ``requests_seen``, persist it with
       ``set_requests_seen``, set the request to ``RUNNING_STATUS`` and call
       ``run_evaluation``.

    Raises:
        ValueError: if the selected request is falsy, or its ``params`` is
            ``None`` or negative.
    """
    current_pending_status = [PENDING_STATUS]

    # Check evals that were running and set them to finished or failed.
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Get all eval requests that are PENDING; to run other evals, change
    # ``current_pending_status``.
    eval_requests, requests_seen = get_eval_requests(
        job_status=current_pending_status,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )
    # Sort the evals by priority (first submitted, first run).
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    # Only the first request in the sorted queue is handled per call.
    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    # ``params is None`` is checked explicitly: comparing None with 0 would
    # raise TypeError instead of the intended, more descriptive ValueError.
    if not eval_request or eval_request.params is None or eval_request.params < 0:
        raise ValueError("Couldn't detect number of params, please make sure the metadata is available")

    instance = _select_instance(eval_request.params)
    if instance is None:
        _mark_request_failed(eval_request)
        pp.pprint(dict(message="Number of params too big, can't run this model", params=eval_request.params))
        return
    instance_size, instance_type, cap = instance

    # Per-instance-type monthly counter stored alongside the per-user entries.
    counter_key = f'count_{instance_size}_{instance_type}'
    if counter_key not in requests_seen:
        requests_seen[counter_key] = 0
    if requests_seen[counter_key] >= cap:
        _mark_request_failed(eval_request)
        pp.pprint(dict(message="Reached maximum cap for requests of this instance type this month", counter=counter_key, instance_type=instance_type, cap=cap))
        return

    # Next, check who made the last commit to this repo and keep track of
    # that. One person shouldn't commit more than 4 models in one month.
    commits = API.list_repo_commits(eval_request.model, revision=eval_request.revision)
    users = commits[0].authors
    for user in users:
        if user in requests_seen and len(requests_seen[user]) >= _MAX_MODELS_PER_USER:
            _mark_request_failed(eval_request)
            pp.pprint(dict(message="Reached maximum cap for requests for this user this month", counter=counter_key, user=user))
            return
        if user not in requests_seen:
            requests_seen[user] = []
        requests_seen[user].append(dict(model_id=eval_request.model, revision=eval_request.revision))

    # NOTE(review): the source's indentation was lost; the increment is placed
    # after the author loop so it counts once per request - confirm.
    requests_seen[counter_key] += 1
    set_requests_seen(
        api=API,
        requests_seen=requests_seen,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    # Mark the request as running before launching the evaluation.
    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_LIGHTEVAL,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        batch_size=25,
        accelerator=ACCELERATOR,
        region=REGION,
        vendor=VENDOR,
        instance_size=instance_size,
        instance_type=instance_type,
        limit=LIMIT,
    )
# Script entry point: each invocation handles at most one pending request.
if __name__ == "__main__":
    run_auto_eval()