import json

from util.assistants import GPTAgent
import json_repair

class evaluator:
    def __init__(self, model_name='GPT4-turbo'):
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]

        for key in required_keys:
            if key not in scores:
                return {k: {"Score": -1, "Justification": "Invalid input"} for k in required_keys}

            score_data = scores[key]

            if not isinstance(score_data, dict):
                return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}

            if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
                    0 <= score_data["Score"] <= 10):
                return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}

            if "Justification" not in score_data or not isinstance(score_data["Justification"], str) or not score_data[
                "Justification"].strip():
                return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}

        return scores

    def evaluate_single(self, question,explanation):

        evaluation_prompt = f"""You are provided with a user's query and the corresponding explanation generated by 
        an Chatbot. Your task is to evaluate the explanation based on the following five principles. Each principle 
        should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all, 
        and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.
        
        Query: 
        {question}
        
        Provided Explanation: 
        {explanation}
        
        Evaluation Criteria:

        Factually Correct:
        Definition: The explanation must be accurate and relevant to the question and the subject matter.
        Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
        
        Useful:
        Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
        Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?
        
        Context Specific:
        Definition: The explanation should be relevant to the specific context or scenario implied by the question.
        Score: (0-10) How well does the explanation address the specific context or scenario of the question?
        
        User Specific:
        Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
        Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?
        
        Provides Pluralism:
        Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
        Score: (0-10) How well does the explanation provide or support multiple perspectives?
    
        After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
 
        Example JSON format:
        {{
        "Factually Correct": {{
            "Justification": "xxx",
            "Score": 9
        }},
        "Useful": {{
            "Justification": "xxx",
            "Score": 8.5
        }},
        "Context Specific": {{
            "Justification": "xxx",
            "Score": 8
        }},
        "User Specific": {{
            "Justification": "xxx",
            "Score": 7.5
        }},
        "Provides Pluralism": {{
            "Justification": "xxx",
            "Score": 7
        }}
    }}
    
    Answer:
    """

        response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()

        print(response)
        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Attempt to repair the JSON if decoding fails
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return {"Factually Correct": -1,"Useful": -1,"Context Specific": -1,"User Specific":-1,"Provides Pluralism":-1}


        return self.validate_scores(scores)

    def format_conversation(self, conversation):
        formatted_conversation = "\n".join(
            f"{exchange['role'].capitalize()}: {exchange['content']}" for exchange in conversation
        )
        return formatted_conversation

    def evaluate_conversation(self, conversation, context):
        formatted_conversation = self.format_conversation(conversation)
        evaluation_prompt = f"""
            You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the explanation based on the following five principles. Each principle 
            should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all, 
            and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.
            
            Conversation:
            {formatted_conversation}
    
            Context:
            {context}
    
            Evaluation Criteria:
    
            Factually Correct:
            Definition: The explanation must be accurate and relevant to the question and the subject matter.
            Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
            
            Useful:
            Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
            Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?
            
            Context Specific:
            Definition: The explanation should be relevant to the specific context or scenario implied by the question.
            Score: (0-10) How well does the explanation address the specific context or scenario of the question?
            
            User Specific:
            Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
            Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?
            
            Provides Pluralism:
            Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
            Score: (0-10) How well does the explanation provide or support multiple perspectives?
        
            After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
     
            Example JSON format:
            {{
            "Factually Correct": {{
                "Justification": "xxx",
                "Score": 9
            }},
            "Useful": {{
                "Justification": "xxx",
                "Score": 8.5
            }},
            "Context Specific": {{
                "Justification": "xxx",
                "Score": 8
            }},
            "User Specific": {{
                "Justification": "xxx",
                "Score": 7.5
            }},
            "Provides Pluralism": {{
                "Justification": "xxx",
                "Score": 7
            }}
        }}
        
        Answer:
        """

        print(evaluation_prompt)

        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=1000).strip()
        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return {key: -1 for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}

        return self.validate_scores(scores)


def write_evaluation_commentary(scores):
    evaluation_details = []

    for principle, details in scores.items():
        print(details)
        score = details.get('Score', -1)
        justification = details.get('Justification', '')

        if score == -1:
            evaluation_details.append(
                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
                 'Justification': justification})
            continue

        if principle == "Factually Correct":
            if score >= 0.8:
                comment = "Excellent accuracy! The information is precise and directly relevant to the question."
            elif score >= 0.5:
                comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
            else:
                comment = "The explanation contains significant inaccuracies or irrelevant information."
        elif principle == "Useful":
            if score >= 0.8:
                comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
            elif score >= 0.5:
                comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
            else:
                comment = "The explanation does little to help understand or apply the information provided."
        elif principle == "Context Specific":
            if score >= 0.8:
                comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
            elif score >= 0.5:
                comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
            else:
                comment = "Fails to address the context of the question, lacking relevance or specificity."
        elif principle == "User Specific":
            if score >= 0.8:
                comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
            elif score >= 0.5:
                comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
            else:
                comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
        elif principle == "Provides Pluralism":
            if score >= 0.8:
                comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
            elif score >= 0.5:
                comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
            else:
                comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."

        evaluation_details.append(
            {'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})

    return evaluation_details
# def write_evaluation_commentary(scores):
#     evaluation_details = []
#     for principle, score in scores.items():
#
#         if score == -1:
#             evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
#             continue
#
#         if principle == "Factually Correct":
#             if score >= 0.8:
#                 comment = "Excellent accuracy! The information is precise and directly relevant to the question."
#             elif score >= 0.5:
#                 comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
#             else:
#                 comment = "The explanation contains significant inaccuracies or irrelevant information."
#         elif principle == "Useful":
#             if score >= 0.8:
#                 comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
#             elif score >= 0.5:
#                 comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
#             else:
#                 comment = "The explanation does little to help understand or apply the information provided."
#         elif principle == "Context Specific":
#             if score >= 0.8:
#                 comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
#             elif score >= 0.5:
#                 comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
#             else:
#                 comment = "Fails to address the context of the question, lacking relevance or specificity."
#         elif principle == "User Specific":
#             if score >= 0.8:
#                 comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
#             elif score >= 0.5:
#                 comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
#             else:
#                 comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
#         elif principle == "Provides Pluralism":
#             if score >= 0.8:
#                 comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
#             elif score >= 0.5:
#                 comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
#             else:
#                 comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
#
#         evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
#     return evaluation_details

if __name__ == '__main__':

    eval = evaluator()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"}
    ]
    context = "general user, user_background is sports enthusiast"
    results = eval.evaluate_conversation(conversation, context)
    print(results)
    print(write_evaluation_commentary(results))