import asyncio
import csv

from langchain_groq import ChatGroq
from pydantic import BaseModel

from src.pipeline import QAPipeline
from src.settings import settings

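# Structured schema the LLM judge must return for each question/answer pair.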
class LLMResponse(BaseModel):
    is_correct: bool
    is_idk: bool


pipeline = QAPipeline()

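# LLM judge used to grade the pipeline's answers.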
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    max_tokens=512,
    max_retries=2,
    timeout=30,  # seconds, not milliseconds
    api_key=settings.GROQ_API_KEY,
)

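# Wrap the judge so it returns a parsed LLMResponse; with include_raw=True,
# each call yields a dict holding both the raw message and the parsed object.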
llm_evaluator = llm.with_structured_output(LLMResponse, include_raw=True)

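# Test queries; three of them have no answer in the pipeline's knowledge base
# (see the matching "I don't know" entries in ground_truths below).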
queries = [
    "What are the ingredients of tomato soup?",
    "Recipe for an apple pie",
    "How do you make a chocolate cake?",
    "How do you make dal bhat?",
    "What are the ingredients for making dhido?",
    "How do you make chana masala?",
    "Recipe for butter chicken",
    "Recipe for ramen",
]

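# Expected answers, aligned index-for-index with `queries`; "I don't know"
# marks questions the pipeline is expected to abstain on.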
ground_truths = [
    "Tomato Soup: tomatoes, onion, garlic, vegetable broth, cream, salt, pepper",
    "Apple Pie: apples, flour, butter, sugar, cinnamon",
    "I don't know",
    "I don't know",
    "I don't know",
    "Chana Masala: chickpeas, tomato, onion, garlic, ginger, spices",
    "Butter Chicken: chicken, tomato puree, cream, butter, spices, garlic, ginger",
    "Ramen: ramen noodles, broth, boiled egg, green onion, soy sauce",
]

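# Run each query through the pipeline, have the judge grade the answer against
# the ground truth, and write per-question results to CSV.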
async def evaluate_pipeline():
    correct = 0
    idk = 0
    total = len(queries)
    results = []

    for q, gt in zip(queries, ground_truths):
        response = await pipeline.answer_query_(q)
        answer = response.answer

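        # Ask the judge to compare the pipeline's answer against the ground truth.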
        eval_prompt = f"""
        You are an evaluator. Assess whether the model's answer is both factually correct and acknowledges lack of knowledge when necessary.

        Question: {q}
        Model's Answer: {answer}
        Ground Truth: {gt}

        Evaluate the following:
        1. Is the model's answer semantically correct when compared to the ground truth?
        2. Does the model appropriately say "I don't know" or avoid answering if the answer is not available?

        Respond in JSON with two fields:
        - is_correct: true or false
        - is_idk: true or false
        """

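        # With include_raw=True the result is a dict; the validated LLMResponse
        # lives under "parsed" (None if the judge's output failed validation).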
        result = await llm_evaluator.ainvoke(eval_prompt)
        parsed = result["parsed"]

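        # An answer counts as correct if the judge marks it correct, or if the
        # model abstained on a question whose ground truth is "I don't know".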
        if parsed.is_correct or (parsed.is_idk and gt.strip().lower() == "i don't know"):
            correct += 1
        if parsed.is_idk:
            idk += 1

        print(
            f"Q: {q}\nA: {answer}\nGT: {gt}\n"
            f"Correct: {parsed.is_correct}, IDK: {parsed.is_idk}\n{'-' * 60}"
        )

        results.append(
            {
                "question": q,
                "model_answer": answer,
                "ground_truth": gt,
                "is_correct": parsed.is_correct,
                "is_idk": parsed.is_idk,
            }
        )

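    # Persist per-question results for later inspection.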
    with open("evaluation_results.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=[
                "question",
                "model_answer",
                "ground_truth",
                "is_correct",
                "is_idk",
            ],
        )
        writer.writeheader()
        writer.writerows(results)

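    # Summary: overall accuracy, plus abstention rate relative to the number of
    # "I don't know" ground truths.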
print(f"\nEvaluation results saved to 'evaluation_results.csv'.") |
|
print(f"Total Correct: {correct}/{total} ({(correct / total) * 100:.2f}%)") |
|
print(f"'I don't know' Responses: {idk}/{3} ({(idk / 3) * 100:.2f}%)") |
|
|
|
|
|
if __name__ == "__main__": |
|
asyncio.run(evaluate_pipeline()) |
|
|