import json
import os
import logging
from datetime import datetime
from argparse import Namespace

from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
from src.backend.manage_requests import EvalRequest
from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder
from lighteval.models.model_loader import ModelInfo
from huggingface_hub.errors import InferenceEndpointTimeoutError

logging.getLogger("openai").setLevel(logging.WARNING)

class DefaultNamespace(Namespace):
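    """A Namespace that returns None for any attribute that was never set.

    lighteval reads a large set of optional arguments from this object, so any
    option we do not pass below simply resolves to None instead of raising
    AttributeError.
    """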
    def __getattr__(self, name):
        return self.__dict__.get(name, None)

def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

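    # Mimic the argument namespace that lighteval's accelerate entry point
    # normally receives from its CLI parser; anything not set here resolves
    # to None via DefaultNamespace above.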
    args = DefaultNamespace(**{
            "model_config": dict(model=dict(
                type="endpoint",
                base_params=dict(
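                    # Endpoint names only allow lowercase alphanumerics and hyphens
                    # and are length-limited, hence the sanitising and truncation below.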
                    endpoint_name=f'{eval_request.model.split("/")[1].replace(".", "-").replace("_", "-").lower()}-lighteval'[-32:].strip('-'),
                    model=eval_request.model,
                    revision=eval_request.revision,
                    dtype=eval_request.precision,
                    reuse_existing=False
                ),
                instance=dict(
                    accelerator=accelerator,
                    region=region,
                    vendor=vendor,
                    instance_size=instance_size,
                    instance_type=instance_type,
                    framework='pytorch',
                    endpoint_type='protected',
                    namespace=OWNER
                ),
                generation=dict(
                    add_special_tokens=True
                )
            )),
            "max_samples": limit,
            "job_id": str(datetime.now()),
            "push_results_to_hub": True,
            "save_details": False,
            "push_details_to_hub": False,
            "public_run": False,
            "cache_dir": CACHE_PATH,
            "results_org": OWNER,
            "output_dir": local_dir,
            "override_batch_size": batch_size,
            "custom_tasks": "custom_tasks.py",
            "tasks": task_names,
            "dataset_loading_processes": 24,
            "num_fewshot_seeds": 0
    })

    try:
        # In case of a timeout, retry up to two more times, reconnecting to the
        # endpoint that is already spinning up via reuse_existing.
        for i in range(3):
            try:
                results = main(args)
                # A successful retry means the endpoint was reused and will not be
                # cleaned up automatically, so raise to fall through to the cleanup
                # branch below.
                if i > 0:
                    raise Exception()
                break
            except InferenceEndpointTimeoutError:
                if i < 2:
                    print('Timed out, trying again...')
                    args.model_config['model']['base_params']['reuse_existing'] = True

        dumped = json.dumps(results, cls=EnhancedJSONEncoder, indent=2)
        print(dumped)
    except Exception as ex:  # if the eval failed, force a cleanup of the endpoint
        import traceback
        traceback.print_exception(ex)
        # Reconnect to the existing endpoint so that it can be torn down.
        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
        args.model_config['model']['base_params']['reuse_existing'] = True
        model_config = create_model_config(args=args, accelerator=accelerator)
        model, _ = load_model(config=model_config, env_config=env_config)
        print('Cleaning up')
        model.reuse_existing = False  # force cleanup() to delete the endpoint
        model.cleanup()
        results = None

    return results
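

# ---------------------------------------------------------------------------
# Hypothetical usage sketch (illustrative only): in the real backend the
# EvalRequest is read from the pending-requests queue and the hardware
# parameters come from the configuration. Constructor fields other than
# `model`, `revision` and `precision` (the attributes used above) are
# assumptions, as are the concrete task string and instance values.
# ---------------------------------------------------------------------------
# request = EvalRequest(model="org/model", revision="main", precision="float16")
# run_evaluation(
#     eval_request=request,
#     task_names="custom|mytask|0|0",   # lighteval task spec: suite|task|fewshot|truncated
#     batch_size=1,
#     local_dir="./eval-results",
#     accelerator="gpu",
#     region="us-east-1",
#     vendor="aws",
#     instance_size="small",
#     instance_type="g4dn.xlarge",
#     limit=10,                          # small limit, for smoke tests only
# )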