Spaces:

arpitt007
/

codebase-RAG

Sleeping

Upload folder using huggingface_hub

ce6b98e verified 3 days ago

1.9 kB

	from openai import OpenAI
	from answer import ask_question
	import json


	# Location of the JSON evaluation dataset opened by load_testset().
	# NOTE(review): this is an absolute, machine-specific Windows path; it will not
	# resolve on other hosts — confirm the intended location before deploying.
	EVAL_PATH = r'C:\AppyProjects\CodeBase RAG bot\codebase_evaluation_dataset.json'
	# Model name passed to the chat-completions call that grades each answer.
	EVALUATION_MODEL = "gpt-4o-mini"
	# Shared OpenAI client, constructed once at import time.
	# NOTE(review): the name matches the `openai` package name; presumably the
	# client picks up its credentials from the environment — verify.
	openai = OpenAI()

	# Load the test dataset.
	def load_testset(path=None):
	    """Read the evaluation dataset from a UTF-8 JSON file and return it.

	    Args:
	        path: Optional location of the JSON file. When omitted (``None``),
	            the module-level ``EVAL_PATH`` is used, so existing zero-argument
	            callers keep their behavior.

	    Returns:
	        Whatever ``json.load`` produces for the file. ``evaluate_chatbot``
	        iterates over it and reads ``'question'`` and ``'expected_answer'``
	        from each item.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    dataset_path = EVAL_PATH if path is None else path
	    with open(dataset_path, 'r', encoding='utf-8') as handle:
	        dataset = json.load(handle)
	    print(f'len of test dataset is - {len(dataset)} ')
	    return dataset


	# Evaluator LLM: grade the chatbot's answers with a judge model.
	def _build_judge_prompt(question, expected, actual):
	    """Return the grading instructions sent to the judge model for one test case."""
	    # The prompt names the codebase domain so the judge grades against the
	    # right kind of answer.
	    return f""" YOU ARE THE JUDGE FOR A RAG BASED CODEBASE QUESTION-ANSWERING BOT.
	YOU ARE PROVIDED WITH A QUESTION, A PREDEFINED/EXPECTED ANSWER AND A LLM GENERATED ANSWER.
	Question: {question}

	Expected Answer:
	{expected}

	Generated Answer:
	{actual}

	Score from 0-5.

	5 = fully correct
	4 = mostly correct
	3 = partially correct
	2 = mostly wrong
	1 = wrong
	0 = hallucinated

	Return only the score.
	"""


	def _parse_score(reply):
	    """Return the first integer in *reply* if it lies in 0-5, otherwise None."""
	    import re  # local import: keeps this block self-contained

	    # The reply may be None or wrapped in text such as "Score: 4" or "4/5".
	    found = re.search(r'\d+', reply or '')
	    if found is None:
	        return None
	    value = int(found.group())
	    return value if 0 <= value <= 5 else None


	def evaluate_chatbot(eval_dataset : list):
	    """Grade the chatbot on every test case and print per-question and average scores.

	    For each item the chatbot answer is obtained from ``ask_question`` and a
	    judge model (``EVALUATION_MODEL``) is asked to rate it from 0 to 5 against
	    the expected answer. Replies from which no 0-5 score can be extracted are
	    reported and left out of the average instead of aborting the run.

	    Args:
	        eval_dataset: Iterable of mappings, each providing ``'question'`` and
	            ``'expected_answer'``.

	    Returns:
	        None. Results are printed.

	    Raises:
	        KeyError: If an item lacks ``'question'`` or ``'expected_answer'``.
	    """
	    scores = []
	    for data in eval_dataset:
	        question = data['question']
	        expected = data['expected_answer']
	        # The second value returned by ask_question is not needed for grading.
	        actual, _unused = ask_question(question)

	        eval_res = openai.chat.completions.create(
	            model=EVALUATION_MODEL,
	            messages=[{
	                'role': 'system',
	                'content': _build_judge_prompt(question, expected, actual),
	            }],
	        )
	        reply = eval_res.choices[0].message.content
	        print(f'score of this question - {reply}')

	        score = _parse_score(reply)
	        if score is None:
	            # One malformed judge reply must not discard the whole run.
	            print(f'could not parse a 0-5 score from judge reply - {reply!r}, skipping')
	            continue
	        scores.append(score)
	        print(f'new scores - {scores}')

	    print(f'final scores are - {scores}')
	    if not scores:
	        # Empty dataset or no gradable replies: avoid dividing by zero.
	        print('Average score is - n/a (no scored questions)')
	        return
	    print(f'Average score is - {sum(scores)/len(scores)}')


	# Script entry point: load the stored test set and score the chatbot on it.
	if __name__ == '__main__':
	    evaluate_chatbot(load_testset())