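"""Gradio app for evaluating an AI-generated Scientific Case Development Document (SCDD)
against a human-written reference using RAGAS metrics, producing formatted scores,
a radar chart, a GPT-4.1 interpretation, and a downloadable Word report."""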
import gradio as gr
import re
import os
from docx import Document
from docx.shared import Inches
from ragas import evaluate, EvaluationDataset
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity, ResponseGroundedness, AnswerAccuracy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
import matplotlib.pyplot as plt
import numpy as np
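# Guard: ChatOpenAI and OpenAIEmbeddings both read OPENAI_API_KEY from the
# environment, so fail fast with a clear message when it is missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; export it before launching the app.")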
# Initialize the OpenAI chat model, embeddings, and the wrapped RAGAS evaluator LLM
llm = ChatOpenAI(model="gpt-4.1")
embeddings = OpenAIEmbeddings()
evaluator_llm = LangchainLLMWrapper(llm)
# ----- Helper Functions -----
def read_docx(file):
"""Extract text from Word document."""
doc = Document(file)
return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
def clean_retrieved_context(raw_context):
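    """Normalize a retrieved text chunk: de-hyphenate line breaks and collapse whitespace."""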
# Remove unnecessary line breaks within paragraphs
cleaned = raw_context.replace("-\n", "").replace("\n", " ")
    # Collapse runs of whitespace into single spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()
def format_ragas_results(ragas_results):
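    """Convert per-sample RAGAS scores (floats in [0, 1]) into percentage strings."""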
return [
{metric: f"{score*100:.2f}%" for metric, score in sample_scores.items()}
for sample_scores in ragas_results.scores
]
def plot_radar_chart(ragas_results):
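    """Render the first sample's RAGAS scores as a radar chart and return the saved PNG path."""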
# Extract first sample if multiple scores
if isinstance(ragas_results.scores, list):
scores_dict = ragas_results.scores[0] # first sample
else:
scores_dict = ragas_results.scores
labels = list(scores_dict.keys())
values = list(scores_dict.values())
# Close the loop for radar chart
values += values[:1]
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]
fig, ax = plt.subplots(figsize=(6,6), subplot_kw=dict(polar=True))
ax.fill(angles, values, color='skyblue', alpha=0.4)
ax.plot(angles, values, color='blue', linewidth=2)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'])
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
ax.set_title('RAGAS Metrics Radar Chart', size=14, y=1.05)
plt.tight_layout()
chart_path = "radar_chart.png"
fig.savefig(chart_path)
plt.close(fig)
return chart_path
def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
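    """Ask the LLM for a balanced, metric-by-metric interpretation of the formatted scores."""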
if not formatted_scores or not isinstance(formatted_scores[0], dict):
return "Invalid RAGAS scores provided."
score_text = "\n".join([f"{k}: {v}" for k, v in formatted_scores[0].items()])
prompt = f"""
You are an expert in RAGAS evaluation metrics for assessing AI-generated content.
The following RAGAS evaluation scores are from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version. This evaluation is conducted in the context of exploratory and novel scientific use cases. The AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.
Interpret these scores with a **balanced and critical lens**:
- Acknowledge that the AI output may contain exploratory and novel content.
- However, evaluate the scores in light of both strengths and **potential weaknesses** or limitations.
- Consider how novelty, phrasing differences, or omissions might impact factual and alignment-based metrics.
- Cover both sides of novelty: genuinely new insights as well as potential inaccuracies.
- Do **not** start with phrases like "Certainly" or "Here's..."; **begin directly with the interpretation**.
RAGAS Scores:
{score_text}
Provide a short paragraph interpretation for each metric.
"""
response = llm.invoke(prompt)
return response.content.strip()
def generate_word_report(science_goal, ragas_results, radar_chart_path, interpretation):
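    """Assemble the science goal, RAGAS scores, radar chart, and interpretation into a .docx report."""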
doc = Document()
doc.add_heading("SCDD Evaluation Report", 0)
doc.add_heading("Science Goal", level=1)
doc.add_paragraph(science_goal)
doc.add_heading("RAGAS SCDD Evaluation Scores", level=1)
for metric, score in ragas_results.scores[0].items():
doc.add_paragraph(f"{metric}: {score*100:.2f}%")
doc.add_heading("RAGAS Metrics Chart", level=1)
doc.add_picture(radar_chart_path, width=Inches(5))
doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1)
doc.add_paragraph(interpretation)
output_path = "SCDD_Evaluation_Report.docx"
doc.save(output_path)
return output_path
def evaluate_scdd(ai_scdd_file, human_scdd_file, user_input):
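    """Run the full RAGAS pipeline on an AI-generated SCDD against a human-written reference."""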
# Read uploaded documents
ai_scdd_text = read_docx(ai_scdd_file)
human_scdd_text = read_docx(human_scdd_file)
    # Split the human reference into paragraph chunks; iterating over the raw
    # string would yield single characters rather than text chunks.
    context = [clean_retrieved_context(chunk) for chunk in human_scdd_text.split("\n")]
# Create RAGAS evaluation dataset
dataset = [{
"user_input": user_input if user_input else "N/A",
"retrieved_contexts": context,
"response": ai_scdd_text,
"reference": human_scdd_text
}]
evaluation_dataset = EvaluationDataset.from_list(dataset)
# Define metrics
metrics = [
SemanticSimilarity(),
ResponseGroundedness(),
Faithfulness(),
AnswerAccuracy(),
        FactualCorrectness(coverage="low", atomicity="low")  # coarse claim decomposition for exploratory text
]
# Run RAGAS evaluation
ragas_result = evaluate(
dataset=evaluation_dataset,
metrics=metrics,
llm=evaluator_llm,
embeddings=embeddings
)
    # Post-process the RAGAS output: formatted scores, radar chart, interpretation, and report
formatted_scores = format_ragas_results(ragas_result)
radar_chart_path = plot_radar_chart(ragas_result)
interpretation = interpret_ragas_results_with_gpt(formatted_scores, llm)
word_report_path = generate_word_report(user_input, ragas_result, radar_chart_path, interpretation)
return formatted_scores, radar_chart_path, interpretation, word_report_path
# ----- Gradio Interface -----
interface = gr.Interface(
fn=evaluate_scdd,
inputs=[
gr.File(label="Upload AI-Generated SCDD (Word .docx)", type='filepath'),
gr.File(label="Upload Human-Generated SCDD (Word .docx)", type='filepath'),
gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
],
outputs=[
gr.JSON(label="RAGAS Evaluation Scores"),
gr.Image(label="RAGAS Metrics Radar Chart"),
gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"),
gr.File(label="Download Word Report")
],
title="RAGAS Evaluation: AI vs Human SCDD",
    description="Compare AI-generated and human-generated science case documents using RAGAS LLM-powered metrics."
)
if __name__ == "__main__":
interface.launch()