"""Score AI-generated explanations against five quality principles using an LLM."""

import json

import json_repair

from util.assistants import GPTAgent

# The principles every evaluation must score, in reporting order.
_PRINCIPLES = (
    "Factually Correct",
    "Useful",
    "Context Specific",
    "User Specific",
    "Provides Pluralism",
)

# Sentinel stored for every principle when an evaluation could not be obtained.
_FAILED_SCORE = -1

# Criteria text shared verbatim by the single-explanation and conversation prompts.
_CRITERIA = """\
Evaluation Criteria:

Factually Correct:
Definition: The explanation must be accurate and relevant to the question and the subject matter.
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

Useful:
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?

Context Specific:
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
Score: (0-1) How well does the explanation address the specific context or scenario of the question?

User Specific:
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?

Provides Pluralism:
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
Score: (0-1) How well does the explanation provide or support multiple perspectives?
"""

# ``str.format`` templates: ``{{`` / ``}}`` render as literal braces in the JSON example.
_SINGLE_PROMPT = """\
You are provided with a user's question and the corresponding explanation generated by an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.

Question:
{question}

Provided Explanation:
{explanation}

{criteria}
After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.

Example JSON format:

Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}

Answer:
"""

_CONVERSATION_PROMPT = """
You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.

Conversation:
{formatted_conversation}

Context:
{context}

{criteria}
After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.

Example JSON format:

Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}

Answer:
"""

# Per-principle commentary as (score >= 0.8, score >= 0.5, otherwise).
_COMMENTARY = {
    "Factually Correct": (
        "Excellent accuracy! The information is precise and directly relevant to the question.",
        "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
        "The explanation contains significant inaccuracies or irrelevant information.",
    ),
    "Useful": (
        "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
        "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
        "The explanation does little to help understand or apply the information provided.",
    ),
    "Context Specific": (
        "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
        "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
        "Fails to address the context of the question, lacking relevance or specificity.",
    ),
    "User Specific": (
        "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
        "Moderately considerate of the user's knowledge level, but could be more tailored.",
        "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
    ),
    "Provides Pluralism": (
        "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
        "Offers some alternative perspectives, but more could be provided to enrich understanding.",
        "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
    ),
}

_UNKNOWN_PRINCIPLE_COMMENT = "No commentary is defined for this principle."


def _failed_scores():
    """Return a fresh dict mapping every principle to the failure sentinel."""
    return {principle: _FAILED_SCORE for principle in _PRINCIPLES}


class evaluator:
    """Ask a GPT model to score explanations on the five principles."""

    def __init__(self, model_name='GPT4-turbo'):
        """Create the underlying ``GPTAgent`` for *model_name*."""
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Check decoded model output and return a clean score dict.

        Args:
            scores: The object decoded from the model's JSON reply.

        Returns:
            A dict holding exactly the five principles. If *scores* is not a
            dict, or any principle is missing, non-numeric, a boolean, or
            outside the 0-1 scale requested in the prompt, every principle is
            set to -1 instead.
        """
        if not isinstance(scores, dict):
            # json.loads may yield a list, number or string; none is usable here.
            return _failed_scores()

        for key in _PRINCIPLES:
            value = scores.get(key)
            # bool is a subclass of int, so JSON true/false must be rejected
            # explicitly. The lower bound is 0 (not -1) so a model-supplied
            # value can never be mistaken for the failure sentinel.
            if (
                isinstance(value, bool)
                or not isinstance(value, (int, float))
                or not 0 <= value <= 1
            ):
                return _failed_scores()

        # Drop any extra keys the model added so downstream code only ever
        # sees the five known principles.
        return {key: scores[key] for key in _PRINCIPLES}

    def _parse_scores(self, response):
        """Decode *response* as JSON, attempting one repair, then validate it."""
        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Attempt to repair the JSON if decoding fails.
            repaired_json = json_repair.repair_json(
                response, skip_json_loads=True, return_objects=False
            )
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return _failed_scores()
        return self.validate_scores(scores)

    def evaluate_single(self, question, explanation):
        """Score one *explanation* given for *question*.

        Returns:
            A dict of the five principle scores, or all -1 when the model's
            reply cannot be decoded or fails validation.
        """
        evaluation_prompt = _SINGLE_PROMPT.format(
            question=question, explanation=explanation, criteria=_CRITERIA
        )
        # NOTE(review): assumes GPTAgent.invoke returns a str -- confirm.
        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
        print(response)
        return self._parse_scores(response)

    def format_conversation(self, conversation):
        """Render *conversation* as one ``Role: content`` line per exchange.

        Each exchange is indexed by the keys ``'role'`` and ``'content'``.
        """
        return "\n".join(
            f"{exchange['role'].capitalize()}: {exchange['content']}"
            for exchange in conversation
        )

    def evaluate_conversation(self, conversation, context):
        """Score the chatbot explanation in *conversation* given *context*.

        Returns:
            A dict of the five principle scores, or all -1 when the model's
            reply cannot be decoded or fails validation.
        """
        evaluation_prompt = _CONVERSATION_PROMPT.format(
            formatted_conversation=self.format_conversation(conversation),
            context=context,
            criteria=_CRITERIA,
        )
        print(evaluation_prompt)
        # NOTE(review): assumes GPTAgent.invoke returns a str -- confirm.
        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
        return self._parse_scores(response)


def write_evaluation_commentary(scores):
    """Turn a score dict into a list of per-principle commentary records.

    Args:
        scores: Mapping of principle name to score.

    Returns:
        A list with one dict per entry of *scores*, each holding the keys
        ``'Principle'``, ``'Score'`` and ``'Commentary'``. A score of -1 gets
        a failure message; a principle without defined commentary gets a
        generic note instead of raising or reusing another principle's text.
    """
    evaluation_details = []
    for principle, score in scores.items():
        if score == _FAILED_SCORE:
            comment = 'Failed to evaluate the explanation.'
        elif principle not in _COMMENTARY:
            comment = _UNKNOWN_PRINCIPLE_COMMENT
        else:
            high, medium, low = _COMMENTARY[principle]
            if score >= 0.8:
                comment = high
            elif score >= 0.5:
                comment = medium
            else:
                comment = low
        evaluation_details.append(
            {'Principle': principle, 'Score': score, 'Commentary': comment}
        )
    return evaluation_details


def main():
    """Evaluate a small sample conversation and print scores and commentary."""
    # Named to avoid shadowing the builtin ``eval``.
    conversation_evaluator = evaluator()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"},
    ]
    context = "general user, user_background is sports enthusiast"
    results = conversation_evaluator.evaluate_conversation(conversation, context)
    print(results)
    print(write_evaluation_commentary(results))


if __name__ == '__main__':
    main()