import json
import logging
import re
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import InferenceEndpointModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from src.backend.manage_requests import EvalRequest
from src.envs import OWNER
from src.logging import setup_logger
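# Only surface warnings and errors from the openai client logger.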
logging.getLogger("openai").setLevel(logging.WARNING)
logger = setup_logger(__name__)
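# Matches every character that is not alphanumeric or a hyphen, so a repo id
# like "org/model" can be normalized into a valid Inference Endpoint name.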
SPECIAL_CHARACTERS_PATTERN = re.compile(r"[^a-zA-Z0-9-]")
def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
"""Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
Args:
eval_request (EvalRequest): Input evaluation request file representation
task_names (list): Tasks to launch
batch_size (int): Selected batch size
accelerator (str): Inference endpoint parameter for running the evaluation
region (str): Inference endpoint parameter for running the evaluation
vendor (str): Inference endpoint parameter for running the evaluation
instance_size (str): Inference endpoint parameter for running the evaluation
instance_type (str): Inference endpoint parameter for running the evaluation
local_dir (str): Where to save the results locally
limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
"""
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
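    # The tracker saves detailed results locally and pushes them to a private
    # dataset under the OWNER organization on the Hub.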
    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=OWNER,
        public=False,
    )
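    # Pipeline-level settings: launch with accelerate and cap the number of
    # samples per task when a debug limit is provided.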
    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        # custom_tasks_directory="custom_tasks.py", # TODO: pass if using a custom task
    )
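    # The model is served on a dedicated Hugging Face Inference Endpoint that
    # lighteval spins up for this run (should_reuse_existing=False).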
    model_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        name=SPECIAL_CHARACTERS_PATTERN.sub("-", eval_request.model.lower()),
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )
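    # Assemble the pipeline from the requested tasks, the endpoint-backed model
    # and the tracker defined above.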
    pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )
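    # Run the evaluation and push the results; if anything fails, force a
    # cleanup of the endpoint before re-raising.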
    try:
        pipeline.evaluate()
        pipeline.show_results()
        pipeline.save_and_push_results()
        results = pipeline.get_results()
        dumped = json.dumps(results, indent=2)
        logger.info(dumped)
    except Exception:  # if eval failed, we force a cleanup
        pipeline.model.cleanup()
        raise

    return results
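# Illustrative usage sketch (hypothetical values: the real EvalRequest dataclass in
# src.backend.manage_requests may require additional fields, and the task string,
# region, instance size and instance type below are only examples):
#
#     request = EvalRequest(model="org/model", precision="float16", revision="main")
#     run_evaluation(
#         eval_request=request,
#         task_names="leaderboard|arc:challenge|25|0",
#         batch_size=1,
#         local_dir="./results",
#         accelerator="gpu",
#         region="us-east-1",
#         vendor="aws",
#         instance_size="x1",
#         instance_type="nvidia-a10g",
#         limit=10,
#     )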