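"""Gradio app that compares an AI-generated Scientific Case Development Document
(SCDD) against a human-written version using RAGAS LLM-powered metrics.

Produces per-metric scores, a radar chart, a GPT-4.1 interpretation, and a
downloadable Word report.
"""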
import gradio as gr
import re
from docx import Document
from docx.shared import Inches
from ragas import evaluate, EvaluationDataset
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity, ResponseGroundedness, AnswerAccuracy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
import matplotlib.pyplot as plt
import numpy as np

# Initialize the OpenAI LLM and embeddings (expects OPENAI_API_KEY in the environment).
llm = ChatOpenAI(model="gpt-4.1")
embeddings = OpenAIEmbeddings()
evaluator_llm = LangchainLLMWrapper(llm)
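# Note: recent ragas versions accept raw LangChain embeddings in evaluate() and
# wrap them internally, so only the LLM needs an explicit LangchainLLMWrapper here.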


# ----- Helper Functions -----
def read_docx(file):
    """Extract the text of all non-empty paragraphs from a Word document."""
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

def clean_retrieved_context(raw_context):
    """Flatten extracted text into a single normalized line."""
    # Rejoin words split by hyphenated line breaks, then flatten remaining breaks.
    cleaned = raw_context.replace("-\n", "").replace("\n", " ")
    # Collapse runs of whitespace into single spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()
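
# Example: clean_retrieved_context("multi-\nline  text\nhere") -> "multiline text here"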

def format_ragas_results(ragas_results):
    """Format each sample's metric scores as percentage strings."""
    return [
        {metric: f"{score * 100:.2f}%" for metric, score in sample_scores.items()}
        for sample_scores in ragas_results.scores
    ]

def plot_radar_chart(ragas_results):
    """Render the first sample's scores as a radar chart and save it as a PNG."""
    # Use the first sample's scores if several were evaluated.
    if isinstance(ragas_results.scores, list):
        scores_dict = ragas_results.scores[0]
    else:
        scores_dict = ragas_results.scores

    labels = list(scores_dict.keys())
    values = list(scores_dict.values())

    # Close the polygon so the last point connects back to the first.
    values += values[:1]
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='skyblue', alpha=0.4)
    ax.plot(angles, values, color='blue', linewidth=2)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title('RAGAS Metrics Radar Chart', size=14, y=1.05)
    plt.tight_layout()

    chart_path = "radar_chart.png"
    fig.savefig(chart_path)
    plt.close(fig)
    return chart_path

def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
    """Ask the LLM for a balanced interpretation of the formatted RAGAS scores."""
    if not formatted_scores or not isinstance(formatted_scores[0], dict):
        return "Invalid RAGAS scores provided."

    score_text = "\n".join(f"{k}: {v}" for k, v in formatted_scores[0].items())
    prompt = f"""
You are an expert in RAGAS evaluation metrics for assessing AI-generated content.
The following RAGAS evaluation scores come from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version. The evaluation is conducted in the context of exploratory and novel scientific use cases, so the AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.

Interpret these scores with a **balanced and critical lens**:
- Acknowledge that the AI output may contain exploratory and novel content.
- However, evaluate the scores in light of both strengths and **potential weaknesses** or limitations.
- Consider how novelty, phrasing differences, or omissions might affect factual and alignment-based metrics.
- Cover both sides of novelty: new insights as well as inaccuracies.
- Do **not** start with phrases like "Certainly" or "Here's..." — **begin directly with the interpretation**.

RAGAS Scores:
{score_text}

Provide a short paragraph of interpretation for each metric.
"""
    response = llm.invoke(prompt)
    return response.content.strip()

def generate_word_report(science_goal, ragas_results, radar_chart_path, interpretation):
    """Assemble the scores, radar chart, and interpretation into a Word report."""
    doc = Document()
    doc.add_heading("SCDD Evaluation Report", 0)

    doc.add_heading("Science Goal", level=1)
    doc.add_paragraph(science_goal)

    doc.add_heading("RAGAS SCDD Evaluation Scores", level=1)
    for metric, score in ragas_results.scores[0].items():
        doc.add_paragraph(f"{metric}: {score * 100:.2f}%")

    doc.add_heading("RAGAS Metrics Chart", level=1)
    doc.add_picture(radar_chart_path, width=Inches(5))

    doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1)
    doc.add_paragraph(interpretation)

    output_path = "SCDD_Evaluation_Report.docx"
    doc.save(output_path)
    return output_path

def evaluate_scdd(ai_scdd_file, human_scdd_file, user_input):
    """Run the full RAGAS evaluation pipeline on the two uploaded documents."""
    # Read the uploaded documents.
    ai_scdd_text = read_docx(ai_scdd_file)
    human_scdd_text = read_docx(human_scdd_file)

    # Treat each paragraph of the human document as one retrieved context chunk.
    # (Iterating over the raw string would yield single characters.)
    context = [clean_retrieved_context(chunk) for chunk in human_scdd_text.split("\n")]

    # Build a single-sample RAGAS evaluation dataset.
    dataset = [{
        "user_input": user_input if user_input else "N/A",
        "retrieved_contexts": context,
        "response": ai_scdd_text,
        "reference": human_scdd_text
    }]
    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Define the metrics to compute.
    metrics = [
        SemanticSimilarity(),
        ResponseGroundedness(),
        Faithfulness(),
        AnswerAccuracy(),
        FactualCorrectness(coverage="low", atomicity="low")
    ]

    # Run the RAGAS evaluation.
    ragas_result = evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm,
        embeddings=embeddings
    )

    # Format, plot, interpret, and package the results.
    formatted_scores = format_ragas_results(ragas_result)
    radar_chart_path = plot_radar_chart(ragas_result)
    interpretation = interpret_ragas_results_with_gpt(formatted_scores, llm)
    word_report_path = generate_word_report(user_input, ragas_result, radar_chart_path, interpretation)
    return formatted_scores, radar_chart_path, interpretation, word_report_path


# ----- Gradio Interface -----
interface = gr.Interface(
    fn=evaluate_scdd,
    inputs=[
        gr.File(label="Upload AI-Generated SCDD (Word .docx)", type='filepath'),
        gr.File(label="Upload Human-Generated SCDD (Word .docx)", type='filepath'),
        gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
    ],
    outputs=[
        gr.JSON(label="RAGAS Evaluation Scores"),
        gr.Image(label="RAGAS Metrics Radar Chart"),
        gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"),
        gr.File(label="Download Word Report")
    ],
    title="RAGAS Evaluation: AI vs Human SCDD",
    description="Compare AI-generated and human-generated science case documents using RAGAS LLM-powered metrics."
)

if __name__ == "__main__":
    interface.launch()