import asyncio
import csv

from langchain_groq import ChatGroq
from pydantic import BaseModel

from src.pipeline import QAPipeline
from src.settings import settings

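# Structured schema the LLM judge must return for each question/answer pair.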
class LLMResponse(BaseModel):
    is_correct: bool
    is_idk: bool


pipeline = QAPipeline()

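# LLM judge used to grade the pipeline's answers.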
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    max_tokens=512,
    max_retries=2,
    timeout=30,  # seconds, not milliseconds
    api_key=settings.GROQ_API_KEY,
)

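# Wrap the judge so it returns a parsed LLMResponse; with include_raw=True,
# each call yields a dict holding both the raw message and the parsed object.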
llm_evaluator = llm.with_structured_output(LLMResponse, include_raw=True)

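# Test queries; three of them have no answer in the pipeline's knowledge base
# (see the matching "I don't know" entries in ground_truths below).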
queries = [
    "What are the ingredients of tomato soup?",
    "Recipe for an apple pie",
    "How do you make a chocolate cake?",
    "How do you make dal bhat?",
    "What are the ingredients for making dhido?",
    "How do you make chana masala?",
    "Recipe for butter chicken",
    "Recipe for ramen",
]

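# Expected answers, aligned index-for-index with `queries`; "I don't know"
# marks questions the pipeline is expected to abstain on.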
ground_truths = [
    "Tomato Soup: tomatoes, onion, garlic, vegetable broth, cream, salt, pepper",
    "Apple Pie: apples, flour, butter, sugar, cinnamon",
    "I don't know",
    "I don't know",
    "I don't know",
    "Chana Masala: chickpeas, tomato, onion, garlic, ginger, spices",
    "Butter Chicken: chicken, tomato puree, cream, butter, spices, garlic, ginger",
    "Ramen: ramen noodles, broth, boiled egg, green onion, soy sauce",
]

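# Run each query through the pipeline, have the judge grade the answer against
# the ground truth, and write per-question results to CSV.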
async def evaluate_pipeline():
    correct = 0
    idk = 0
    total = len(queries)
    results = []

    for q, gt in zip(queries, ground_truths):
        response = await pipeline.answer_query_(q)
        answer = response.answer

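        # Ask the judge to compare the pipeline's answer against the ground truth.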
        eval_prompt = f"""
        You are an evaluator. Assess whether the model's answer is both factually correct and acknowledges lack of knowledge when necessary.

        Question: {q}
        Model's Answer: {answer}
        Ground Truth: {gt}

        Evaluate the following:
        1. Is the model's answer semantically correct when compared to the ground truth?
        2. Does the model appropriately say "I don't know" or avoid answering if the answer is not available?

        Respond in JSON with two fields:
        - is_correct: true or false
        - is_idk: true or false
        """

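        # With include_raw=True the result is a dict; the validated LLMResponse
        # lives under "parsed" (None if the judge's output failed validation).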
        result = await llm_evaluator.ainvoke(eval_prompt)
        parsed = result["parsed"]

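        # An answer counts as correct if the judge marks it correct, or if the
        # model abstained on a question whose ground truth is "I don't know".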
        if parsed.is_correct or (parsed.is_idk and gt.strip().lower() == "i don't know"):
            correct += 1
        if parsed.is_idk:
            idk += 1

        print(
            f"Q: {q}\nA: {answer}\nGT: {gt}\n"
            f"Correct: {parsed.is_correct}, IDK: {parsed.is_idk}\n{'-' * 60}"
        )

        results.append(
            {
                "question": q,
                "model_answer": answer,
                "ground_truth": gt,
                "is_correct": parsed.is_correct,
                "is_idk": parsed.is_idk,
            }
        )

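    # Persist per-question results for later inspection.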
    with open("evaluation_results.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=[
                "question",
                "model_answer",
                "ground_truth",
                "is_correct",
                "is_idk",
            ],
        )
        writer.writeheader()
        writer.writerows(results)

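    # Summary: overall accuracy, plus abstention rate relative to the number of
    # "I don't know" ground truths.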
print(f"\nEvaluation results saved to 'evaluation_results.csv'.") |
|
print(f"Total Correct: {correct}/{total} ({(correct / total) * 100:.2f}%)") |
|
print(f"'I don't know' Responses: {idk}/{3} ({(idk / 3) * 100:.2f}%)") |
|
|
|
|
|
if __name__ == "__main__": |
|
asyncio.run(evaluate_pipeline()) |
|
|