codebase-RAG / eval.py
arpitt007's picture
Upload folder using huggingface_hub
ce6b98e verified
from openai import OpenAI
from answer import ask_question
import json
# Absolute path of the JSON evaluation dataset opened by load_testset().
# NOTE(review): machine-specific Windows location - it will not exist on other hosts.
EVAL_PATH = r'C:\AppyProjects\CodeBase RAG bot\codebase_evaluation_dataset.json'
# Model name passed to the chat-completions call that grades each answer.
EVALUATION_MODEL = "gpt-4o-mini"
# Shared client instance used by evaluate_chatbot(); it is created at import time.
# NOTE(review): presumably picks up the API key from the environment - confirm.
openai = OpenAI()
#load test dataset
def load_testset(path=None):
    """Load the evaluation dataset from a JSON file and report its size.

    Args:
        path: Location of the JSON file to read. When omitted (``None``),
            the module-level ``EVAL_PATH`` is used, which keeps existing
            zero-argument callers working unchanged.

    Returns:
        The decoded JSON content. ``evaluate_chatbot`` iterates over it and
        reads the ``question`` and ``expected_answer`` keys of each entry.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Resolve the default lazily so the function can be pointed at any
    # dataset without editing the hard-coded, machine-specific constant.
    if path is None:
        path = EVAL_PATH
    with open(path, 'r', encoding='utf-8') as fh:
        dataset = json.load(fh)
    print(f'len of test dataset is - {len(dataset)} ')
    return dataset
# evaluator LLM
def _build_judge_prompt(question, expected, actual):
    """Return the grading prompt sent to the judge model for one test case."""
    # The text lines sit at column 0 on purpose: they are part of the string
    # literal, and indenting them would change the prompt sent to the model.
    return f""" YOU ARE THE JUDGE FOR A RAG BASED CODEBASE BOT.
YOU ARE PROVIDED WITH A QUESTION, A PREDEFINED/EXPECTED ANSWER AND A LLM GENERATRED ANSWER.
Question: {question}
Expected Answer:
{expected}
Generated Answer:
{actual}
Score from 0-5.
5 = fully correct
4 = mostly correct
3 = partially correct
2 = mostly wrong
1 = wrong
0 = hallucinated
Return only the score.
"""


def _parse_score(reply):
    """Return the 0-5 score contained in the judge's reply, or None if absent.

    Accepts a bare digit plus common decorations such as ``"Score: 4"``,
    ``"4/5"``, ``"**4**"`` or ``"4."``. Anything else (``"4.5"``, ``"10"``,
    free-form sentences, ``None``) is rejected rather than guessed at, so a
    malformed reply can never be silently recorded as a wrong score.
    """
    import re  # local import keeps this block independent of the file header

    matched = re.fullmatch(
        r'\W*(?:score\W*)?([0-5])(?:\s*/\s*5)?\W*',
        reply or '',
        re.IGNORECASE,
    )
    return int(matched.group(1)) if matched else None


def evaluate_chatbot(eval_dataset: list):
    """Grade the chatbot's answers with a judge LLM and print the scores.

    For each entry the chatbot is asked the entry's question, and the judge
    model (``EVALUATION_MODEL``) rates the generated answer against the
    expected one on a 0-5 scale. Per-question scores, the running list, the
    final list and the average are printed.

    Args:
        eval_dataset: Iterable of mappings, each providing a ``question``
            and an ``expected_answer`` key.

    Returns:
        None. Results are reported via ``print``.

    Raises:
        KeyError: If an entry lacks ``question`` or ``expected_answer``.
    """
    scores = []
    for data in eval_dataset:
        question = data['question']
        expected = data['expected_answer']
        # ask_question returns a pair; only the first element (the generated
        # answer) is graded here.
        actual, _ = ask_question(question)
        eval_res = openai.chat.completions.create(
            model=EVALUATION_MODEL,
            messages=[{
                'role': 'system',
                'content': _build_judge_prompt(question, expected, actual),
            }],
        )
        reply = eval_res.choices[0].message.content
        print(f'score of this question - {reply}')
        score = _parse_score(reply)
        if score is None:
            # A reply that is not a clean 0-5 score used to crash int();
            # report it and leave it out of the average instead.
            print(f'could not parse a 0-5 score from judge reply {reply!r}; skipping')
            continue
        scores.append(score)
        print(f'new scores - {scores}')
    print(f'final scores are - {scores}')
    if scores:
        print(f'Average score is - {sum(scores)/len(scores)}')
    else:
        # Empty dataset or no parseable replies: avoid dividing by zero.
        print('Average score is - n/a (no scores collected)')
if __name__ == '__main__':
    # Script entry point: load the evaluation set and grade every entry in it.
    evaluate_chatbot(load_testset())