import json
import logging
import re
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import InferenceEndpointModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from src.backend.manage_requests import EvalRequest
from src.envs import OWNER
from src.logging import setup_logger
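# Only surface warnings and errors from the openai client logger.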
logging.getLogger("openai").setLevel(logging.WARNING)
logger = setup_logger(__name__)
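# Matches every character that is not alphanumeric or a hyphen, so a repo id
# like "org/model" can be normalized into a valid Inference Endpoint name.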
SPECIAL_CHARACTERS_PATTERN = re.compile(r"[^a-zA-Z0-9-]")
def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
"""Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
Args:
eval_request (EvalRequest): Input evaluation request file representation
task_names (list): Tasks to launch
batch_size (int): Selected batch size
accelerator (str): Inference endpoint parameter for running the evaluation
region (str): Inference endpoint parameter for running the evaluation
vendor (str): Inference endpoint parameter for running the evaluation
instance_size (str): Inference endpoint parameter for running the evaluation
instance_type (str): Inference endpoint parameter for running the evaluation
local_dir (str): Where to save the results locally
limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
"""
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
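    # The tracker saves detailed results locally and pushes them to a private
    # dataset under the OWNER organization on the Hub.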
    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=OWNER,
        public=False,
    )
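    # Pipeline-level settings: launch with accelerate and cap the number of
    # samples per task when a debug limit is provided.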
    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        # custom_tasks_directory="custom_tasks.py", # TODO: pass if using a custom task
    )
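    # The model is served on a dedicated Hugging Face Inference Endpoint that
    # lighteval spins up for this run (should_reuse_existing=False).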
    model_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        name=SPECIAL_CHARACTERS_PATTERN.sub("-", eval_request.model.lower()),
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )
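    # Assemble the pipeline from the requested tasks, the endpoint-backed model
    # and the tracker defined above.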
    pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )
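    # Run the evaluation and push the results; if anything fails, force a
    # cleanup of the endpoint before re-raising.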
    try:
        pipeline.evaluate()
        pipeline.show_results()
        pipeline.save_and_push_results()
        results = pipeline.get_results()
        dumped = json.dumps(results, indent=2)
        logger.info(dumped)
    except Exception:  # if eval failed, we force a cleanup
        pipeline.model.cleanup()
        raise

    return results
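# Illustrative usage sketch (hypothetical values: the real EvalRequest dataclass in
# src.backend.manage_requests may require additional fields, and the task string,
# region, instance size and instance type below are only examples):
#
#     request = EvalRequest(model="org/model", precision="float16", revision="main")
#     run_evaluation(
#         eval_request=request,
#         task_names="leaderboard|arc:challenge|25|0",
#         batch_size=1,
#         local_dir="./results",
#         accelerator="gpu",
#         region="us-east-1",
#         vendor="aws",
#         instance_size="x1",
#         instance_type="nvidia-a10g",
#         limit=10,
#     )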