import argparse
import csv
import logging
import traceback

from langchain_openai import ChatOpenAI
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    AnswerAccuracy,
    ContextRelevance,
    ResponseGroundedness,
)

from main import RunChain, RunGraph
from retriever import BuildRetriever

# Suppress these messages:
# INFO:openai._base_client:Retrying request to /chat/completions in ___ seconds
# https://community.openai.com/t/suppress-http-request-post-message/583334/8
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.WARNING)


def load_questions_and_references(csv_path):
    """Read questions and references from a CSV file"""
    questions = []
    references = []
    with open(csv_path, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            questions.append(row["question"].strip('"'))
            references.append(row["reference"].strip('"'))
    return questions, references
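
# Note: load_questions_and_references() expects a CSV with "question" and
# "reference" columns. A minimal sketch of the format (the values below are
# illustrative assumptions, not rows from the actual eval.csv):
#
#   question,reference
#   "How do I subset a data frame by a column value?","Use df[df$col == value, ] ..."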


def build_eval_dataset(questions, references, compute_mode, workflow, search_type):
    """Build dataset for evaluation"""
    dataset = []
    for question, reference in zip(questions, references):
        try:
            if workflow == "chain":
                print("\n\n--- Question ---")
                print(question)
                response = RunChain(question, compute_mode, search_type)
                print("--- Response ---")
                print(response)
                # Retrieve context documents for a question
                retriever = BuildRetriever(compute_mode, search_type)
                docs = retriever.invoke(question)
                retrieved_contexts = [doc.page_content for doc in docs]
            elif workflow == "graph":
                result = RunGraph(question, compute_mode, search_type)
                retrieved_contexts = []
                if "retrieved_emails" in result:
                    # Remove the source file names (e.g. R-help/2022-September.txt)
                    # because they confuse the evaluator
                    retrieved_contexts = [
                        "\n\n\nFrom" + email.split("\n\n\nFrom")[1]
                        for email in result["retrieved_emails"]
                    ]
                response = result["answer"]
            dataset.append(
                {
                    "user_input": question,
                    "retrieved_contexts": retrieved_contexts,
                    "response": response,
                    "reference": reference,
                }
            )
        except Exception:
            print(
                f"--- Question omitted from evals due to failed generation: {question} ---"
            )
            print(traceback.format_exc())
    return dataset
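
# Note: the dict keys used above (user_input, retrieved_contexts, response,
# reference) are the field names ragas expects when building an
# EvaluationDataset from a list of dicts via EvaluationDataset.from_list().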


def run_evals_with_csv(csv_path):
    """Run evals using saved responses in a CSV file"""
    # Load an evaluation dataset from saved responses in a CSV file
    csv_questions = []
    retrieved_emails = []
    answers = []
    with open(csv_path, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            csv_questions.append(row["question"].strip('"'))
            retrieved_emails.append(row["retrieved_emails"].strip('"'))
            answers.append(row["answer"].strip('"'))
    questions, references = load_questions_and_references("eval.csv")
    # Make sure the questions are the same
    assert csv_questions == questions
    # Build dataset for evaluation
    dataset = []
    for question, reference, retrieved_email, answer in zip(
        questions, references, retrieved_emails, answers
    ):
        # Remove the source file names (e.g. R-help/2022-September.txt)
        # because they confuse the evaluator
        retrieved_contexts = (
            [
                "\n\n\nFrom" + email.split("\n\n\nFrom")[1]
                for email in retrieved_email.split(
                    "\n\n--- --- --- --- Next Email --- --- --- ---\n\n"
                )
            ]
            if retrieved_email != ""
            else []
        )
        dataset.append(
            {
                "user_input": question,
                "retrieved_contexts": retrieved_contexts,
                "response": answer,
                "reference": reference,
            }
        )
    evaluation_dataset = EvaluationDataset.from_list(dataset)
    # Set up LLM for evaluation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    evaluator_llm = LangchainLLMWrapper(llm)
    # Evaluate
    result = evaluate(
        dataset=evaluation_dataset,
        # NVIDIA metrics
        metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
        llm=evaluator_llm,
    )
    print("Evaluation Results:")
    print(result)
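
# Note: run_evals_with_csv() is not wired into the CLI below; it is intended to
# be called directly, e.g. run_evals_with_csv("responses.csv"), where the CSV
# has "question", "retrieved_emails", and "answer" columns ("responses.csv" is
# an assumed file name, not one defined here).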


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate RAG retrieval and generation."
    )
    parser.add_argument(
        "--compute_mode",
        choices=["remote", "local"],
        required=True,
        help="Compute mode: remote or local.",
    )
    parser.add_argument(
        "--workflow",
        choices=["chain", "graph"],
        required=True,
        help="Workflow: chain or graph.",
    )
    parser.add_argument(
        "--search_type",
        choices=["dense", "sparse", "hybrid"],
        required=True,
        help="Search type: dense, sparse, or hybrid.",
    )
    args = parser.parse_args()
    compute_mode = args.compute_mode
    workflow = args.workflow
    search_type = args.search_type
    questions, references = load_questions_and_references("eval.csv")
    dataset = build_eval_dataset(
        questions, references, compute_mode, workflow, search_type
    )
    evaluation_dataset = EvaluationDataset.from_list(dataset)
    # Set up LLM for evaluation
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    evaluator_llm = LangchainLLMWrapper(llm)
    # Evaluate
    result = evaluate(
        dataset=evaluation_dataset,
        # NVIDIA metrics
        metrics=[ContextRelevance(), ResponseGroundedness(), AnswerAccuracy()],
        llm=evaluator_llm,
    )
    print("Evaluation Results:")
    print(result)


if __name__ == "__main__":
    main()
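
# Example invocation (the file name evals.py is an assumption; adjust to the
# actual script name in this repo):
#   python evals.py --compute_mode remote --workflow chain --search_type hybrid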