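# Backend worker for the leaderboard Space: it mirrors the evaluation request
# queue and results datasets from the Hugging Face Hub, runs the evaluation
# suite on queued models, and uploads the resulting JSON files back to the
# results dataset.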
import os
import json
import logging
import pprint
from datetime import datetime

from huggingface_hub import snapshot_download

from src.backend.run_eval_suite import run_evaluation
from src.backend.manage_requests import EvalRequest, check_completed_evals, get_eval_requests, set_eval_request
from src.backend.sort_queue import sort_models_by_priority
from src.backend.envs import Tasks, Task, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT
from src.leaderboard.read_evals import EvalResult
from src.envs import QUEUE_REPO, RESULTS_REPO, API

logging.getLogger("openai").setLevel(logging.WARNING)
logging.basicConfig(level=logging.ERROR)

pp = pprint.PrettyPrinter(width=80)

PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

TASKS_HARNESS = [task.value for task in Tasks]
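
# Mirror the results and requests datasets from the Hub into local working
# directories; the backend reads these copies and pushes updates back through the API.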
snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)


def sanity_checks():
    print(f'Device: {DEVICE}')

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
                          failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
                          hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
    return


def request_to_result_name(request: EvalRequest) -> str:
    # Request: EvalRequest(model='meta-llama/Llama-2-13b-hf', private=False, status='FINISHED',
    #          json_filepath='./eval-queue-bk/meta-llama/Llama-2-13b-hf_eval_request_False_False_False.json',
    #          weight_type='Original', model_type='pretrained', precision='float32', base_model='', revision='main',
    #          submitted_time='2023-09-09T10:52:17Z', likes=389, params=13.016, license='?')
    #
    # EvalResult(eval_name='meta-llama_Llama-2-13b-hf_float32', full_model='meta-llama/Llama-2-13b-hf',
    #            org='meta-llama', model='Llama-2-13b-hf', revision='main',
    #            results={'nq_open': 33.739612188365655, 'triviaqa': 74.12505572893447},
    #            precision=<Precision.float32: ModelDetails(name='float32', symbol='')>,
    #            model_type=<ModelType.PT: ModelDetails(name='pretrained', symbol='🟢')>,
    #            weight_type=<WeightType.Original: ModelDetails(name='Original', symbol='')>,
    #            architecture='LlamaForCausalLM', license='?', likes=389, num_params=13.016, date='2023-09-09T10:52:17Z', still_on_hub=True)
    org_and_model = request.model.split("/", 1)

    if len(org_and_model) == 1:
        model = org_and_model[0]
        res = f"{model}_{request.precision}"
    else:
        org = org_and_model[0]
        model = org_and_model[1]
        res = f"{org}_{model}_{request.precision}"

    return res
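
# request_to_result_name builds the key under which a request's results are stored:
# e.g. the EvalRequest above, with precision 'float32', maps to
# 'meta-llama_Llama-2-13b-hf_float32', which matches EvalResult.eval_name.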


def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
    results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
                             batch_size=1, device=DEVICE, no_cache=True, limit=LIMIT)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    # Use a single timestamp so the local file and the uploaded file share the same name
    timestamp = datetime.now()
    output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{timestamp}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{timestamp}.json",
                    repo_id=RESULTS_REPO, repo_type="dataset")

    return results
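
# process_evaluation stores each result both locally (under EVAL_RESULTS_PATH_BACKEND)
# and in the RESULTS_REPO dataset, under "<org>/<model>/results_<timestamp>.json".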


def process_finished_requests() -> bool:
    sanity_checks()

    current_finished_status = [FINISHED_STATUS]

    # Get all eval requests that are FINISHED; change this parameter to run other evals
    eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted, first run)
    eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)

    from src.leaderboard.read_evals import get_raw_eval_results
    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)

    result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
    result_name_to_result = {r.eval_name: r for r in eval_results}

    for eval_request in eval_requests:
        result_name: str = request_to_result_name(eval_request)

        # Check the corresponding result
        eval_result: EvalResult = result_name_to_result[result_name]

        # Iterate over tasks and, if we do not have results for a task, run the relevant evaluation
        for task in TASKS_HARNESS:
            task_name = task.benchmark

            if task_name not in eval_result.results:
                results = process_evaluation(task, eval_request)
                # Process at most one missing task per call
                return True

    return False


def process_pending_requests() -> bool:
    sanity_checks()

    current_pending_status = [PENDING_STATUS]

    # Get all eval requests that are PENDING; change this parameter to run other evals
    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted, first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return False

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    # Mark the request as RUNNING before starting the evaluations
    set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)

    # Run every task in the harness for this request
    for task in TASKS_HARNESS:
        results = process_evaluation(task, eval_request)

    # All tasks completed: mark the request as FINISHED
    set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO,
                     local_dir=EVAL_REQUESTS_PATH_BACKEND)

    return True
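

# Entry point: try to start one PENDING request first; if the queue is empty,
# backfill a missing task for an already FINISHED request instead.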

if __name__ == "__main__":
    res = process_pending_requests()

    if res is False:
        res = process_finished_requests()