import json
import re

from openai import OpenAI

from answer import ask_question


# Absolute path of the JSON evaluation dataset that load_testset() reads.
EVAL_PATH = r'C:\AppyProjects\CodeBase RAG bot\codebase_evaluation_dataset.json'
# Model name passed to the chat-completions call that judges each answer.
EVALUATION_MODEL = "gpt-4o-mini"
# Shared API client, created once at import time.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment - confirm. The name mirrors the `openai` package, but
# only `OpenAI` is imported from it in this file, so nothing is shadowed here.
openai = OpenAI()

# Load the test dataset
def load_testset(path=None):
    """Load the evaluation dataset from a JSON file.

    Args:
        path: Location of the JSON file to read. When omitted (or ``None``),
            the module-level ``EVAL_PATH`` is used, which preserves the
            original zero-argument behaviour.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        # Resolved at call time so the default always tracks the constant.
        path = EVAL_PATH
    with open(path, 'r', encoding='utf-8') as fh:
        dataset = json.load(fh)
    print(f'len of test dataset is - {len(dataset)} ')
    return dataset
    
    
# Evaluator LLM
def evaluate_chatbot(eval_dataset: list) -> None:
    """Score the chatbot's answers against expected answers using a judge LLM.

    For every entry in *eval_dataset* the chatbot is asked the question via
    ``ask_question``, and a judge model (``EVALUATION_MODEL``) is prompted to
    rate the generated answer against the expected one on a 0-5 scale. Each
    score, the running list of scores, and finally the average are printed.

    Args:
        eval_dataset: Sequence of mappings, each providing the keys
            ``'question'`` and ``'expected_answer'``.

    Returns:
        None. Results are reported on stdout only.

    Notes:
        * A judge reply that contains no usable 0-5 score is skipped with a
          warning instead of aborting the whole run.
        * When there is nothing to average (empty dataset, or every reply was
          unusable) a message is printed instead of dividing by zero.
    """
    scores = []
    skipped = 0
    for data in eval_dataset:

        question = data['question']
        expected = data['expected_answer']
        # ask_question returns a pair; only the generated answer is judged.
        actual, _ = ask_question(question)

        # NOTE(review): the prompt calls this a "MEDICAL BOT" while the
        # dataset file is named for a codebase bot - confirm the wording.
        judge_prompt = f""" YOU ARE THE JUDGE FOR A RAG BASED MEDICAL BOT.
            YOU ARE PROVIDED WITH A QUESTION, A PREDEFINED/EXPECTED ANSWER AND A LLM GENERATRED ANSWER.
            Question: {question}

            Expected Answer:
            {expected}

            Generated Answer:
            {actual}

            Score from 0-5.

            5 = fully correct
            4 = mostly correct
            3 = partially correct
            2 = mostly wrong
            1 = wrong
            0 = hallucinated

            Return only the score.
            """
        eval_res = openai.chat.completions.create(
            model=EVALUATION_MODEL,
            messages=[{'role': 'system', 'content': judge_prompt}]
        )
        reply = eval_res.choices[0].message.content
        print(f'score of this question - {reply}')

        score = _parse_score(reply)
        if score is None:
            # The judge may ignore "Return only the score"; do not let one
            # malformed reply throw away every score collected so far.
            skipped += 1
            print(f'could not parse a 0-5 score from judge reply, skipping - {reply!r}')
            continue
        scores.append(score)
        print(f'new scores - {scores}')

    print(f'final scores are - {scores}')
    if skipped:
        print(f'skipped questions (unparseable judge reply) - {skipped}')
    if not scores:
        print('Average score is - n/a (no scores collected)')
        return
    print(f'Average score is - {sum(scores)/len(scores)}')


def _parse_score(reply):
    """Return the first standalone 0-5 digit in *reply*, or None if absent."""
    if not reply:
        return None
    found = re.search(r'\b[0-5]\b', reply)
    return int(found.group()) if found else None
    

# Script entry point: load the evaluation set and run the judge over it.
if __name__ == '__main__':
    evaluate_chatbot(load_testset())