"""Score a RAG chatbot against a JSON test set using an LLM as the judge.

For every entry of the dataset the chatbot is asked the question, and a
second model (``EVALUATION_MODEL``) grades the generated answer against the
expected answer on a 0-5 scale. Per-question scores and the average are
printed to stdout.
"""

import json
import re

from openai import OpenAI

from answer import ask_question

EVAL_PATH = r'C:\AppyProjects\CodeBase RAG bot\codebase_evaluation_dataset.json'
EVALUATION_MODEL = "gpt-4o-mini"

# Client used for the judge calls. The name shadows the `openai` package name;
# it is kept unchanged because it is a module-level name other code may use.
openai = OpenAI()

# A standalone digit in the 0-5 range, so replies such as "Score: 4" or "4/5"
# still yield a usable score while "10" or "42" do not.
_SCORE_PATTERN = re.compile(r"\b[0-5]\b")


def load_testset() -> list:
    """Load the evaluation dataset from ``EVAL_PATH`` and return it.

    The file is parsed as UTF-8 JSON and its length is printed.
    The entries are presumably dicts with ``question`` and
    ``expected_answer`` keys, since that is what ``evaluate_chatbot`` reads.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(EVAL_PATH, 'r', encoding='utf-8') as dataset_file:
        dataset = json.load(dataset_file)
    print(f'len of test dataset is - {len(dataset)} ')
    return dataset


def _build_judge_prompt(question, expected, actual) -> str:
    """Return the grading prompt for one question/expected/generated triple."""
    # NOTE(review): the prompt says "MEDICAL BOT" while EVAL_PATH points at a
    # codebase evaluation dataset - confirm the intended wording. The text is
    # reproduced exactly as it was so the judge's behaviour does not change.
    return (
        f" YOU ARE THE JUDGE FOR A RAG BASED MEDICAL BOT."
        f" YOU ARE PROVIDED WITH A QUESTION, A PREDEFINED/EXPECTED ANSWER"
        f" AND A LLM GENERATRED ANSWER."
        f" Question: {question}"
        f" Expected Answer: {expected}"
        f" Generated Answer: {actual}"
        f" Score from 0-5."
        f" 5 = fully correct"
        f" 4 = mostly correct"
        f" 3 = partially correct"
        f" 2 = mostly wrong"
        f" 1 = wrong"
        f" 0 = hallucinated"
        f" Return only the score. "
    )


def _parse_score(raw):
    """Return the first standalone 0-5 digit in *raw*, or ``None`` if absent.

    ``raw`` may be ``None`` (a reply without text content) or contain extra
    text around the number; both cases are handled without raising.
    """
    if not raw:
        return None
    match = _SCORE_PATTERN.search(raw)
    return int(match.group()) if match else None


def evaluate_chatbot(eval_dataset: list) -> None:
    """Grade the chatbot on every entry of *eval_dataset* and print the results.

    Args:
        eval_dataset: entries supporting ``entry['question']`` and
            ``entry['expected_answer']``.

    For each entry the chatbot's answer is obtained from ``ask_question`` and
    sent to the judge model together with the expected answer. Replies from
    which no 0-5 score can be extracted are reported and left out of the
    average instead of aborting the whole run. Nothing is returned.
    """
    scores = []
    for data in eval_dataset:
        question = data['question']
        expected = data['expected_answer']
        # ask_question returns a pair; only the answer text is graded.
        actual, _chunks = ask_question(question)

        eval_res = openai.chat.completions.create(
            model=EVALUATION_MODEL,
            messages=[{
                'role': 'system',
                'content': _build_judge_prompt(question, expected, actual),
            }],
        )
        raw_score = eval_res.choices[0].message.content
        print(f'score of this question - {raw_score}')

        score = _parse_score(raw_score)
        if score is None:
            # One malformed judge reply should not discard every score
            # collected so far.
            print(f'could not read a 0-5 score from {raw_score!r} - skipping this question')
            continue
        scores.append(score)
        print(f'new scores - {scores}')

    print(f'final scores are - {scores}')
    if not scores:
        # Empty dataset, or no reply could be parsed: no average to compute.
        print('Average score is - n/a (no questions were scored)')
        return
    print(f'Average score is - {sum(scores)/len(scores)}')


if __name__ == '__main__':
    testset = load_testset()
    evaluate_chatbot(testset)