Spaces:
Runtime error
Runtime error
#!/usr/bin/env python | |
from huggingface_hub import snapshot_download | |
from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND | |
from src.backend.manage_requests import get_eval_requests | |
from src.backend.manage_requests import EvalRequest | |
from src.backend.run_eval_suite import run_evaluation | |
from lm_eval.tasks import initialize_tasks, include_task_folder | |
from lm_eval import tasks, evaluator, utils | |
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task | |
from src.envs import QUEUE_REPO | |
def main():
    """Run a small debugging evaluation of one queued model on the HaluEval QA task(s).

    Steps:
      1. Download the ``QUEUE_REPO`` dataset repo into ``EVAL_REQUESTS_PATH_BACKEND``.
      2. Load the eval requests in any status and pick the first one whose
         model name contains ``bloom-560m``.
      3. Select every entry of ``Tasks`` whose benchmark name contains
         ``halueval_qa``.
      4. Run each selected task through ``evaluator.simple_evaluate`` and
         print the ``"results"`` entry, then stop in the debugger.

    Raises:
        IndexError: if no eval request matches ``bloom-560m``.
    """
    # Fetch the request queue; get_eval_requests below is given the same local_dir.
    snapshot_download(
        repo_id=QUEUE_REPO,
        revision="main",
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        repo_type="dataset",
        max_workers=60,
    )

    # Ask for requests in every status so the target model is found regardless
    # of where it currently sits in the queue. Narrow this list to restrict the search.
    job_statuses = ["PENDING", "RUNNING", "FINISHED", "FAILED"]
    eval_requests: list[EvalRequest] = get_eval_requests(
        job_status=job_statuses,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    target_model = "bloom-560m"
    matching_requests = [r for r in eval_requests if target_model in r.model]
    if not matching_requests:
        # Same exception type the bare `[0]` lookup would raise, but with context.
        raise IndexError(
            f"No eval request for a model containing {target_model!r} "
            f"was found in {QUEUE_REPO!r}"
        )
    eval_request = matching_requests[0]

    # Only the HaluEval QA task(s); drop the filter to evaluate every entry of Tasks.
    selected_tasks = [t.value for t in Tasks if 'halueval_qa' in t.value.benchmark]

    # Register the project's custom task folder before initializing the harness tasks.
    include_task_folder("src/backend/tasks/")
    initialize_tasks('INFO')

    print(tasks.ALL_TASKS)

    for task in selected_tasks:
        print(f"Selected Tasks: [{task}]")

        # NOTE(review): limit=8 is hard-coded here rather than using the imported
        # LIMIT — presumably to keep debug runs short; confirm before reusing.
        results = evaluator.simple_evaluate(
            model="hf",
            model_args=eval_request.get_model_args(),
            tasks=[task.benchmark],
            num_fewshot=0,
            batch_size=1,
            device=DEVICE,
            use_cache=None,
            limit=8,
            write_out=True,
        )

        print('AAA', results["results"])

        # Pause after each task so `results` can be inspected interactively.
        # Set PYTHONBREAKPOINT=0 in the environment to run unattended.
        breakpoint()
# Run the debugging evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()