import re


# Used to check our model's performance on multiple-choice tasks. This can also
# be done in a more involved way with e.g. LLM-as-a-judge.
def check_multiple_choice_with_regex(model_outputs, correct_answers):
    """Check whether each model output mentions its expected answer label.

    Outputs and answers are paired positionally with ``zip``, so any extra
    items in the longer of the two sequences are ignored.

    An output counts as correct when the upper-cased answer label appears in
    it in one of these forms:

    * as a standalone word (``"The answer is B"``),
    * directly followed by ``.``, ``,`` or ``)`` (``"B) Paris"``),
    * as a standalone token inside one pair of parentheses (``"(B)"``).

    Only the expected label is upper-cased; the model output is searched
    case-sensitively, so a lowercase ``"b"`` in the output does not match.
    This is a heuristic: any standalone occurrence of the label counts, e.g.
    the English article "A" matches the label ``A``.

    Args:
        model_outputs: Iterable of model response strings.
        correct_answers: Iterable of expected answer labels (e.g. ``"a"``).

    Returns:
        A list of booleans, one per (output, answer) pair, ``True`` when the
        expected label was found in the output.
    """
    results = []
    for model_output, correct_answer in zip(model_outputs, correct_answers):
        # Escape the label so regex metacharacters in it (e.g. "c++", "a)")
        # are matched literally instead of raising re.error.
        answer = re.escape(correct_answer.upper())
        patterns = (
            rf"\b{answer}\b",  # Label as a standalone word
            rf"\b{answer}[.,)]",  # Label followed by punctuation
            # Label as a standalone token inside a single (...) group. The
            # lookarounds stop letters inside longer words from matching,
            # e.g. the "B" of "(Because ...)".
            rf"\([^()]*(?<!\w){answer}(?!\w)[^()]*\)",
        )
        results.append(
            any(re.search(pattern, model_output) for pattern in patterns)
        )
    return results