import gradio as gr
import re
import os
from docx import Document
from docx.shared import Inches
from ragas import evaluate, EvaluationDataset
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity, ResponseGroundedness, AnswerAccuracy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
import matplotlib.pyplot as plt
import numpy as np

# Initialize OpenAI API
llm = ChatOpenAI(model="gpt-4.1")
embeddings = OpenAIEmbeddings()
evaluator_llm = LangchainLLMWrapper(llm)
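# Note: ChatOpenAI and OpenAIEmbeddings read the API key from the
# OPENAI_API_KEY environment variable (e.g. set as a Space secret).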

# ----- Helper Functions -----
def read_docx(file):
    """Extract the text from a Word document."""
    doc = Document(file)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

def clean_retrieved_context(raw_context):
    """Normalize a retrieved text chunk before evaluation."""
    # Re-join words hyphenated across line breaks, then flatten remaining line breaks
    cleaned = raw_context.replace("-\n", "").replace("\n", " ")
    # Collapse repeated whitespace into single spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Return the cleaned context
    return cleaned.strip()
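# Illustrative example (not part of the app): de-hyphenation plus whitespace
# collapsing turns "spectro-\nscopy of the\n  target" into "spectroscopy of the target".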

def format_ragas_results(ragas_results):
    """Convert per-sample RAGAS scores into percentage strings."""
    return [
        {metric: f"{score*100:.2f}%" for metric, score in sample_scores.items()}
        for sample_scores in ragas_results.scores
    ]
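# Illustrative output shape (the dictionary keys follow the RAGAS metric names
# and may differ by version), e.g.:
# [{"faithfulness": "91.30%", "semantic_similarity": "87.25%", ...}]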

def plot_radar_chart(ragas_results):
    # Extract first sample if multiple scores
    if isinstance(ragas_results.scores, list):
        scores_dict = ragas_results.scores[0]  # first sample
    else:
        scores_dict = ragas_results.scores
    labels = list(scores_dict.keys())
    values = list(scores_dict.values())
    # Close the loop for radar chart
    values += values[:1]
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='skyblue', alpha=0.4)
    ax.plot(angles, values, color='blue', linewidth=2)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title('RAGAS Metrics Radar Chart', size=14, y=1.05)
    plt.tight_layout()
    chart_path = "radar_chart.png"
    fig.savefig(chart_path)
    plt.close(fig)
    return chart_path

def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
    if not formatted_scores or not isinstance(formatted_scores[0], dict):
        return "Invalid RAGAS scores provided."
    score_text = "\n".join([f"{k}: {v}" for k, v in formatted_scores[0].items()])
    prompt = f"""
You are an expert in using RAGAS evaluation metrics to evaluate AI-generated content.
The following RAGAS evaluation scores are from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version. This evaluation is conducted in the context of exploratory and novel scientific use cases. The AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.
Interpret these scores with a **balanced and critical lens**:
- Acknowledge that the AI output may contain exploratory and novel content.
- However, evaluate the scores in light of both strengths and **potential weaknesses** or limitations.
- Consider how novelty, phrasing differences, or omissions might impact factual and alignment-based metrics.
- Cover both sides of novelty: new insights as well as potential inaccuracies.
- Do **not** start with phrases like "Certainly" or "Here's..."; **begin directly with the interpretation**.
RAGAS Scores:
{score_text}
Provide a short paragraph of interpretation for each metric.
"""
    response = llm.invoke(prompt)
    return response.content.strip()

def generate_word_report(science_goal, ragas_results, radar_chart_path, interpretation):
    """Assemble the scores, chart, and interpretation into a downloadable Word report."""
    doc = Document()
    doc.add_heading("SCDD Evaluation Report", 0)
    doc.add_heading("Science Goal", level=1)
    doc.add_paragraph(science_goal)
    doc.add_heading("RAGAS SCDD Evaluation Scores", level=1)
    for metric, score in ragas_results.scores[0].items():
        doc.add_paragraph(f"{metric}: {score*100:.2f}%")
    doc.add_heading("RAGAS Metrics Chart", level=1)
    doc.add_picture(radar_chart_path, width=Inches(5))
    doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1)
    doc.add_paragraph(interpretation)
    output_path = "SCDD_Evaluation_Report.docx"
    doc.save(output_path)
    return output_path

def evaluate_scdd(ai_scdd_file, human_scdd_file, user_input):
    # Read uploaded documents
    ai_scdd_text = read_docx(ai_scdd_file)
    human_scdd_text = read_docx(human_scdd_file)
    # Split the human reference into paragraph chunks and clean each one
    # (iterating over the raw string directly would yield individual characters)
    context = [clean_retrieved_context(chunk) for chunk in human_scdd_text.split("\n") if chunk.strip()]
    # Create RAGAS evaluation dataset
    dataset = [{
        "user_input": user_input if user_input else "N/A",
        "retrieved_contexts": context,
        "response": ai_scdd_text,
        "reference": human_scdd_text
    }]
    evaluation_dataset = EvaluationDataset.from_list(dataset)
    # Define metrics
    metrics = [
        SemanticSimilarity(),
        ResponseGroundedness(),
        Faithfulness(),
        AnswerAccuracy(),
        FactualCorrectness(coverage="low", atomicity="low")
    ]
    # Run RAGAS evaluation
    ragas_result = evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm,
        embeddings=embeddings
    )
    # RAGAS metrics outputs
    formatted_scores = format_ragas_results(ragas_result)
    radar_chart_path = plot_radar_chart(ragas_result)
    interpretation = interpret_ragas_results_with_gpt(formatted_scores, llm)
    word_report_path = generate_word_report(user_input, ragas_result, radar_chart_path, interpretation)
    return formatted_scores, radar_chart_path, interpretation, word_report_path

# ----- Gradio Interface -----
interface = gr.Interface(
    fn=evaluate_scdd,
    inputs=[
        gr.File(label="Upload AI-Generated SCDD (Word .docx)", type='filepath'),
        gr.File(label="Upload Human-Generated SCDD (Word .docx)", type='filepath'),
        gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
    ],
    outputs=[
        gr.JSON(label="RAGAS Evaluation Scores"),
        gr.Image(label="RAGAS Metrics Radar Chart"),
        gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"),
        gr.File(label="Download Word Report")
    ],
    title="RAGAS Evaluation: AI vs Human SCDD",
    description="Compare AI-generated and human-generated science case documents using RAGAS LLM-powered metrics"
)

if __name__ == "__main__":
    interface.launch()
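
# Usage note (assumes the dependencies are installed and OPENAI_API_KEY is set):
# running this script locally, e.g. `python app.py`, launches the Gradio UI in the
# browser; on a Hugging Face Space the app is served automatically.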