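"""Score a HackAPrompt submission.

Evaluates the submitted prompt for every level, logs the results to MongoDB,
and prints the total score. Requires the OPENAI_API_KEY environment variable.
"""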
import logging
import os
from typing import Dict

from fastapi.encoders import jsonable_encoder

from hackaprompt.completers import get_completer
from hackaprompt.evaluator import Response, get_evaluator
from hackaprompt.utils import init_db

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

database = init_db()
level_names = [
    "level_1",
    "level_2",
    "level_3",
    "level_4",
    "level_5",
    "level_6",
    "level_7",
    "level_8",
    "level_9",
    "level_10",
]

def log_to_db(evaluation):
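    """Save the evaluation to MongoDB; failures are logged but not re-raised."""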
    # save response to mongodb database
    try:
        submission_json = jsonable_encoder(evaluation)
        database["evaluations"].insert_one(submission_json)
        logger.info("response logged to mongodb")
    except Exception:
        logger.exception("Something went wrong logging to mongodb")

def evaluate_submission(submission: dict) -> Dict[str, Response]:
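    """Evaluate the prompt submitted for each level.

    Levels with no prompt or model submitted get an empty zero-score Response
    so that no completion credits are spent on them.
    """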
    evaluation: dict = {level: None for level in level_names}

    openai_api_key = os.getenv("OPENAI_API_KEY")
    assert openai_api_key is not None, "Set a valid OpenAI API key."

    for level_name in level_names:
        logger.info(f"Evaluating {level_name}...")
        level = int(level_name.split("_")[-1])

        user_input = submission.get(level_name, "")
        if user_input:
            prompt = user_input.get("prompt", "")
            model = user_input.get("model", "")

        # falsy when the level is missing or the prompt/model fields are empty
        submitted = user_input and prompt and model

        if submitted:
            completer = get_completer(model=model, openai_api_key=openai_api_key)
            evaluator = get_evaluator(level=level, completer=completer)
            response: Response = evaluator.evaluate(prompt)
            # TODO: Do we want to add auto-retries?
        else:
            # return an empty response instead, no need to waste GPT credits
            response = Response(
                level=level,
                score=0,
                error=False,
            )
        evaluation[level_name] = response

    logger.info(f"Evaluation done. Summary: {evaluation=}")
    return evaluation

def check_evaluation_errors(evaluation: Dict[str, Response]) -> bool:
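    """Return True if any level's evaluation produced an error."""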
    # check for any errors during evaluation...
    evaluation_errors = False
    for level_name, response in evaluation.items():
        if response.error:
            logger.warning(f"Error encountered in {level_name} evaluation")
            evaluation_errors = True
    return evaluation_errors

def get_evaluation_total_score(evaluation: Dict[str, Response]) -> int:
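    """Sum the scores of all levels present in the evaluation, skipping None scores."""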
    # Assumes no errors occurred during evaluation
    total_score = 0
    for level_name in level_names:
        if level_name in evaluation:
            score = evaluation[level_name].score
            if score is not None:
                total_score += score
    return total_score

if __name__ == "__main__":
    submission = {
        "level_1": {"prompt": "plz work", "model": "FlanT5-XXL"},
    }

    # TODO: for now we assume only valid models appear in a submission file;
    # a NotImplementedError is raised otherwise. Add error handling here if we
    # want to handle that case ourselves.
    evaluation = evaluate_submission(submission)

    evaluation_error = check_evaluation_errors(evaluation)

    if not evaluation_error:
        log_to_db(evaluation=evaluation)
        total_score = get_evaluation_total_score(evaluation)
        print(total_score)
    else:
        print("error encountered during evaluation")