import gradio as gr
import re
from docx import Document
from docx.shared import Inches
from ragas import evaluate, EvaluationDataset
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity, ResponseGroundedness, AnswerAccuracy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
import matplotlib.pyplot as plt
import numpy as np

# Initialize the OpenAI LLM and embeddings (requires OPENAI_API_KEY in the
# environment) and wrap the LLM for use as a RAGAS evaluator
llm = ChatOpenAI(model="gpt-4.1")
embeddings = OpenAIEmbeddings()
evaluator_llm = LangchainLLMWrapper(llm)

# ----- Helper Functions -----

def read_docx(file):
    """Extract text from a Word document, skipping empty paragraphs."""
    doc = Document(file)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

def clean_retrieved_context(raw_context):
    """Normalize whitespace in a retrieved context chunk."""
    # Rejoin hyphenated line breaks, then flatten remaining newlines
    cleaned = raw_context.replace("-\n", "").replace("\n", " ")
    # Collapse runs of whitespace into single spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

def format_ragas_results(ragas_results):
    """Convert per-sample RAGAS scores into percentage strings."""
    return [
        {metric: f"{score*100:.2f}%" for metric, score in sample_scores.items()}
        for sample_scores in ragas_results.scores
    ]

def plot_radar_chart(ragas_results):
    # Use the first sample's scores if multiple samples were evaluated
    if isinstance(ragas_results.scores, list):
        scores_dict = ragas_results.scores[0]
    else:
        scores_dict = ragas_results.scores

    labels = list(scores_dict.keys())
    values = list(scores_dict.values())

    # Repeat the first point so the radar polygon closes its loop
    values += values[:1]
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='skyblue', alpha=0.4)
    ax.plot(angles, values, color='blue', linewidth=2)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title('RAGAS Metrics Radar Chart', size=14, y=1.05)
    plt.tight_layout()

    chart_path = "radar_chart.png"
    fig.savefig(chart_path)
    plt.close(fig)
    return chart_path

def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
    if not formatted_scores or not isinstance(formatted_scores[0], dict):
        return "Invalid RAGAS scores provided."

    score_text = "\n".join([f"{k}: {v}" for k, v in formatted_scores[0].items()])

    prompt = f"""
You are an expert in RAGAS metrics for evaluating AI-generated content. The following RAGAS evaluation scores come from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version.

This evaluation is conducted in the context of exploratory and novel scientific use cases. The AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.

Interpret these scores with a **balanced and critical lens**:
- Acknowledge that the AI output may contain exploratory and novel content.
- However, evaluate the scores in light of both strengths and **potential weaknesses** or limitations.
- Consider how novelty, phrasing differences, or omissions might affect factual and alignment-based metrics.
- Cover both aspects of novelty, that is, new insights as well as inaccuracies.
- Do **not** start with phrases like "Certainly" or "Here's..." — **begin directly with the interpretation**.

RAGAS Scores:
{score_text}

Provide a short paragraph of interpretation for each metric.
""" response = llm.invoke(prompt) return response.content.strip() def generate_word_report(science_goal, ragas_results, radar_chart_path, interpretation): doc = Document() doc.add_heading("SCDD Evaluation Report", 0) doc.add_heading("Science Goal", level=1) doc.add_paragraph(science_goal) doc.add_heading("RAGAS SCDD Evaluation Scores", level=1) for metric, score in ragas_results.scores[0].items(): doc.add_paragraph(f"{metric}: {score*100:.2f}%") doc.add_heading("RAGAS Metrics Chart", level=1) doc.add_picture(radar_chart_path, width=Inches(5)) doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1) doc.add_paragraph(interpretation) output_path = "SCDD_Evaluation_Report.docx" doc.save(output_path) return output_path def evaluate_scdd(ai_scdd_file, human_scdd_file, user_input): # Read uploaded documents ai_scdd_text = read_docx(ai_scdd_file) human_scdd_text = read_docx(human_scdd_file) context = [clean_retrieved_context(chunk) for chunk in human_scdd_text] # Create RAGAS evaluation dataset dataset = [{ "user_input": user_input if user_input else "N/A", "retrieved_contexts": context, "response": ai_scdd_text, "reference": human_scdd_text }] evaluation_dataset = EvaluationDataset.from_list(dataset) # Define metrics metrics = [ SemanticSimilarity(), ResponseGroundedness(), Faithfulness(), AnswerAccuracy(), FactualCorrectness(coverage="low", atomicity="low") ] # Run RAGAS evaluation ragas_result = evaluate( dataset=evaluation_dataset, metrics=metrics, llm=evaluator_llm, embeddings=embeddings ) # RAGAS metrics outputs formatted_scores = format_ragas_results(ragas_result) radar_chart_path = plot_radar_chart(ragas_result) interpretation = interpret_ragas_results_with_gpt(formatted_scores, llm) word_report_path = generate_word_report(user_input, ragas_result, radar_chart_path, interpretation) return formatted_scores, radar_chart_path, interpretation, word_report_path # ----- Gradio Interface ----- interface = gr.Interface( fn=evaluate_scdd, inputs=[ gr.File(label="Upload AI-Generated SCDD (Word .docx)", type='filepath'), gr.File(label="Upload Human-Generated SCDD (Word .docx)", type='filepath'), gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."), ], outputs=[ gr.JSON(label="RAGAS Evaluation Scores"), gr.Image(label="RAGAS Metrics Radar Chart"), gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"), gr.File(label="Download Word Report") ], title="RAGAS Evaluation: AI vs Human SCDD", description="Compare AI-generated and human-generated science case documents using RAGAS LLM-powered metrics" ) if __name__ == "__main__": interface.launch()