#!/usr/bin/env python
import logging

import torch
from huggingface_hub import snapshot_download
from lm_eval import tasks, evaluator, utils
from lm_eval.models.huggingface import HFLM
from lm_eval.tasks import TaskManager

from src.envs import QUEUE_REPO
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
from src.backend.manage_requests import EvalRequest, get_eval_requests
from src.backend.run_eval_suite import run_evaluation
from src.backend.tasks.xsum.task import XSum
from src.backend.tasks.xsum.task_v2 import XSumv2
from src.backend.tasks.cnndm.task import CNNDM
from src.backend.tasks.cnndm.task_v2 import CNNDMv2
from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT


def main():
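    """Run one evaluation task against a single queued eval request, for local debugging."""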
    # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)

    PENDING_STATUS = "PENDING"
    RUNNING_STATUS = "RUNNING"
    FINISHED_STATUS = "FINISHED"
    FAILED_STATUS = "FAILED"

    # Fetch eval requests in every status; trim this list (e.g. to FINISHED only) to target a different subset
    status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]

    eval_requests: list[EvalRequest] = get_eval_requests(
        job_status=status,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        do_download=False,
    )
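
    # Pin this debugging run to a single queued model.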
    # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
    eval_request = [r for r in eval_requests if 'meta-llama/Llama-2-7b-hf' in r.model][0]
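
    # Pick the task to evaluate; only task.benchmark and task.num_fewshot are passed to the harness below.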
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
# my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
# my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
# my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
# my_task = Task("nq_swap", "exact_match", "NQ-Swap", 2)
# my_task = Task("memo-trap_v2", "acc", "XXX", 2)
my_task = Task("xsum_v2", "rougeL", "XXX", 0)
# my_task = Task("squadv2", "exact", "XXX", 0)
# my_task = Task("scrolls_qasper", "f1", "XXX", 0)

    eval_logger = utils.eval_logger
    eval_logger.setLevel(logging.DEBUG)

    TASKS_HARNESS = [my_task]
    # TASKS_HARNESS = [task.value for task in Tasks]
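
    # Make the leaderboard's custom task definitions under src/backend/tasks/ visible to the harness.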
    task_manager = TaskManager(include_path="./src/backend/tasks/")
    print(task_manager.all_tasks)
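
    # device="mps" and limit=2 keep this a quick local smoke test; the imported DEVICE and LIMIT
    # constants from src.backend.envs can be substituted for a full backend run.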
    for task in TASKS_HARNESS:
        print(f"Selected Tasks: [{task}]")

        results = evaluator.simple_evaluate(
            model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark],
            num_fewshot=task.num_fewshot, batch_size=1, device="mps", use_cache=None,
            limit=2, write_out=True, task_manager=task_manager,
        )
        print('AAA', results["results"])

        breakpoint()


if __name__ == "__main__":
    main()