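"""Score a hackaprompt submission.

Evaluates the prompt submitted for each level (1-10), logs the results to MongoDB,
and prints the total score when no errors occurred during evaluation.
"""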
import logging
import os
from typing import Dict

from fastapi.encoders import jsonable_encoder

from hackaprompt.completers import get_completer
from hackaprompt.evaluator import Response, get_evaluator
from hackaprompt.utils import init_db

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

database = init_db()  # MongoDB database handle used to store evaluations

level_names = [
    "level_1",
    "level_2",
    "level_3",
    "level_4",
    "level_5",
    "level_6",
    "level_7",
    "level_8",
    "level_9",
    "level_10",
]


def log_to_db(evaluation):
    # Save the evaluation to the MongoDB database
    try:
        submission_json = jsonable_encoder(evaluation)
        database["evaluations"].insert_one(submission_json)
        logger.info("Response logged to MongoDB")
    except Exception:
        logger.exception("Something went wrong logging to MongoDB")


def evaluate_submission(submission: dict) -> Dict[str, Response]:
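    """Run each level's submitted prompt through its evaluator and return a Response per level."""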
    evaluation: dict = {level: None for level in level_names}

    openai_api_key = os.getenv("OPENAI_API_KEY")
    assert openai_api_key is not None, "Set a valid OPENAI_API_KEY environment variable."

    for level_name in level_names:
        logger.info(f"Evaluating {level_name}...")
        level = int(level_name.split("_")[-1])  # parse the full level number, so "level_10" -> 10

        user_input = submission.get(level_name, {}) or {}
        prompt = user_input.get("prompt", "")
        model = user_input.get("model", "")

        submitted = bool(prompt and model)

        if submitted:
            completer = get_completer(model=model, openai_api_key=openai_api_key)

            evaluator = get_evaluator(level=level, completer=completer)
            response: Response = evaluator.evaluate(prompt)
            # TODO: Do we want to add auto-retries?
        else:
            # Return an empty response instead; no need to waste GPT credits.
            response = Response(
                level=level,
                score=0,
                error=False,
            )
        evaluation[level_name] = response

    logger.info(f"evaluation done. Summary: {evaluation=}")
    return evaluation


def check_evaluation_errors(evaluation: Dict[str, Response]) -> bool:
    # Check whether any errors occurred during evaluation
    evaluation_errors = False
    for level_name, response in evaluation.items():
        if response.error:
            logger.warning(f"Error encounted in {level_name} evaluation")
            evaluation_errors = True

    return evaluation_errors


def get_evaluation_total_score(evaluation: Dict[str, Response]) -> int:
    # Assumes no errors occurred during evaluation
    total_score = 0
    for level_name in level_names:
        if level_name in evaluation:
            score = evaluation[level_name].score
            if score is not None:
                total_score += score

    return total_score


if __name__ == "__main__":
    submission = {
        "level_1": {"prompt": "plz work", "model": "FlanT5-XXL"},
    }

    # TODO: For now, we assume only valid models are submitted in the submission file;
    # an invalid model will raise a NotImplementedError.
    # Add error handling here if we want to handle it ourselves.
    evaluation = evaluate_submission(submission)

    evaluation_error = check_evaluation_errors(evaluation)

    if not evaluation_error:
        log_to_db(evaluation=evaluation)
        total_score = get_evaluation_total_score(evaluation)
        print(total_score)
    else:
        print("error encountered during evaluation")