Spaces:
Sleeping
Sleeping
import json
import re

from openai import OpenAI

from answer import ask_question
# JSON file holding the evaluation records read by load_testset().
# NOTE(review): absolute, machine-specific Windows path — confirm whether this
# should be configurable before running elsewhere.
EVAL_PATH = r"C:\AppyProjects\CodeBase RAG bot\codebase_evaluation_dataset.json"

# Chat model used as the judge when grading generated answers.
EVALUATION_MODEL = "gpt-4o-mini"

# OpenAI client shared by the whole module; created once at import time.
openai = OpenAI()
# Load the test dataset
def load_testset(path=None):
    """Load the evaluation dataset from a JSON file.

    Args:
        path: Location of the JSON file to read. When omitted (``None``) the
            module-level ``EVAL_PATH`` is used, so existing zero-argument
            callers behave exactly as before.

    Returns:
        The decoded JSON document, as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Resolve the default lazily so the constant is only needed when the
    # caller did not supply a path of their own.
    source = EVAL_PATH if path is None else path
    with open(source, 'r', encoding='utf-8') as handle:
        dataset = json.load(handle)
    print(f'len of test dataset is - {len(dataset)} ')
    return dataset
# Evaluator LLM
def _build_judge_prompt(question, expected, actual):
    """Render the grading instructions sent to the judge model."""
    # The bot under test is a codebase RAG bot (see the dataset path in
    # EVAL_PATH); the prompt previously named the wrong domain ("MEDICAL").
    return f""" YOU ARE THE JUDGE FOR A RAG BASED CODEBASE BOT.
YOU ARE PROVIDED WITH A QUESTION, A PREDEFINED/EXPECTED ANSWER AND A LLM GENERATRED ANSWER.
Question: {question}
Expected Answer:
{expected}
Generated Answer:
{actual}
Score from 0-5.
5 = fully correct
4 = mostly correct
3 = partially correct
2 = mostly wrong
1 = wrong
0 = hallucinated
Return only the score.
"""


def _ask_judge(prompt):
    """Send *prompt* to the judge model and return its raw text reply."""
    reply = openai.chat.completions.create(
        model=EVALUATION_MODEL,
        messages=[{'role': 'system', 'content': prompt}],
    )
    return reply.choices[0].message.content


def _parse_score(raw):
    """Extract the 0-5 integer score from the judge's raw reply.

    Tolerates surrounding text such as ``"Score: 4"`` or ``"4/5"``; a bare
    ``int(raw)`` would raise on those.

    Raises:
        ValueError: If the reply is empty/None or holds no standalone 0-5 digit.
    """
    match = re.search(r"\b[0-5]\b", raw or "")
    if match is None:
        raise ValueError(f'judge reply contains no 0-5 score: {raw!r}')
    return int(match.group())


def evaluate_chatbot(eval_dataset: list, *, answer_fn=None, judge_fn=None) -> list:
    """Grade the chatbot on every record of *eval_dataset* using a judge LLM.

    For each record the question is answered, the judge is asked to rate the
    generated answer against the expected one on a 0-5 scale, and the running
    scores are printed. The final score list and its average are printed at
    the end.

    Args:
        eval_dataset: Iterable of mappings, each providing the keys
            ``'question'`` and ``'expected_answer'``.
        answer_fn: Optional callable ``question -> (answer, chunks)``.
            Defaults to ``ask_question``.
        judge_fn: Optional callable ``prompt -> raw reply text``. Defaults to
            a call to the OpenAI chat completion API with ``EVALUATION_MODEL``.

    Returns:
        The list of integer scores, one per record (empty for an empty
        dataset).

    Raises:
        KeyError: If a record lacks ``'question'`` or ``'expected_answer'``.
        ValueError: If a judge reply contains no 0-5 score.
    """
    if answer_fn is None:
        answer_fn = ask_question
    if judge_fn is None:
        judge_fn = _ask_judge

    scores = []
    for data in eval_dataset:
        question = data['question']
        expected = data['expected_answer']
        # The second element of the answer tuple is not needed for scoring.
        actual, _unused = answer_fn(question)

        raw_score = judge_fn(_build_judge_prompt(question, expected, actual))
        print(f'score of this question - {raw_score}')
        scores.append(_parse_score(raw_score))
        print(f'new scores - {scores}')

    print(f'final scores are - {scores}')
    if scores:
        print(f'Average score is - {sum(scores)/len(scores)}')
    else:
        # Avoid ZeroDivisionError when there was nothing to evaluate.
        print('Average score is - n/a (no questions were evaluated)')
    return scores
# Script entry point: load the evaluation set and grade the chatbot on it.
if __name__ == "__main__":
    evaluate_chatbot(load_testset())