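"""Gradio app that compares an AI-generated Scientific Case Development Document
(SCDD) against a human-written version using RAGAS LLM-powered metrics.

Produces per-metric scores, a radar chart, a GPT-4.1 interpretation, and a
downloadable Word report.
"""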
import gradio as gr
import re
from docx import Document
from docx.shared import Inches
from ragas import evaluate, EvaluationDataset
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity, ResponseGroundedness, AnswerAccuracy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
import matplotlib.pyplot as plt
import numpy as np

# Initialize the OpenAI LLM and embeddings (expects OPENAI_API_KEY in the environment).
llm = ChatOpenAI(model="gpt-4.1")
embeddings = OpenAIEmbeddings()
evaluator_llm = LangchainLLMWrapper(llm)
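# Note: recent ragas versions accept raw LangChain embeddings in evaluate() and
# wrap them internally, so only the LLM needs an explicit LangchainLLMWrapper here.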


# ----- Helper Functions -----
def read_docx(file):
    """Extract the text of all non-empty paragraphs from a Word document."""
    doc = Document(file)
    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

def clean_retrieved_context(raw_context):
    """Flatten extracted text into a single normalized line."""
    # Rejoin words split by hyphenated line breaks, then flatten remaining breaks.
    cleaned = raw_context.replace("-\n", "").replace("\n", " ")
    # Collapse runs of whitespace into single spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()
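
# Example: clean_retrieved_context("multi-\nline  text\nhere") -> "multiline text here"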

def format_ragas_results(ragas_results):
    """Format each sample's metric scores as percentage strings."""
    return [
        {metric: f"{score * 100:.2f}%" for metric, score in sample_scores.items()}
        for sample_scores in ragas_results.scores
    ]

def plot_radar_chart(ragas_results):
    """Render the first sample's scores as a radar chart and save it as a PNG."""
    # Use the first sample's scores if several were evaluated.
    if isinstance(ragas_results.scores, list):
        scores_dict = ragas_results.scores[0]
    else:
        scores_dict = ragas_results.scores

    labels = list(scores_dict.keys())
    values = list(scores_dict.values())

    # Close the polygon so the last point connects back to the first.
    values += values[:1]
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='skyblue', alpha=0.4)
    ax.plot(angles, values, color='blue', linewidth=2)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title('RAGAS Metrics Radar Chart', size=14, y=1.05)
    plt.tight_layout()

    chart_path = "radar_chart.png"
    fig.savefig(chart_path)
    plt.close(fig)
    return chart_path

def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
    """Ask the LLM for a balanced interpretation of the formatted RAGAS scores."""
    if not formatted_scores or not isinstance(formatted_scores[0], dict):
        return "Invalid RAGAS scores provided."

    score_text = "\n".join(f"{k}: {v}" for k, v in formatted_scores[0].items())
    prompt = f"""
You are an expert in RAGAS evaluation metrics for assessing AI-generated content.
The following RAGAS evaluation scores come from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version. The evaluation is conducted in the context of exploratory and novel scientific use cases, so the AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.

Interpret these scores with a **balanced and critical lens**:
- Acknowledge that the AI output may contain exploratory and novel content.
- However, evaluate the scores in light of both strengths and **potential weaknesses** or limitations.
- Consider how novelty, phrasing differences, or omissions might affect factual and alignment-based metrics.
- Cover both sides of novelty: new insights as well as inaccuracies.
- Do **not** start with phrases like "Certainly" or "Here's..." — **begin directly with the interpretation**.

RAGAS Scores:
{score_text}

Provide a short paragraph of interpretation for each metric.
"""
    response = llm.invoke(prompt)
    return response.content.strip()

def generate_word_report(science_goal, ragas_results, radar_chart_path, interpretation):
    """Assemble the scores, radar chart, and interpretation into a Word report."""
    doc = Document()
    doc.add_heading("SCDD Evaluation Report", 0)

    doc.add_heading("Science Goal", level=1)
    doc.add_paragraph(science_goal)

    doc.add_heading("RAGAS SCDD Evaluation Scores", level=1)
    for metric, score in ragas_results.scores[0].items():
        doc.add_paragraph(f"{metric}: {score * 100:.2f}%")

    doc.add_heading("RAGAS Metrics Chart", level=1)
    doc.add_picture(radar_chart_path, width=Inches(5))

    doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1)
    doc.add_paragraph(interpretation)

    output_path = "SCDD_Evaluation_Report.docx"
    doc.save(output_path)
    return output_path

def evaluate_scdd(ai_scdd_file, human_scdd_file, user_input):
    """Run the full RAGAS evaluation pipeline on the two uploaded documents."""
    # Read the uploaded documents.
    ai_scdd_text = read_docx(ai_scdd_file)
    human_scdd_text = read_docx(human_scdd_file)

    # Treat each paragraph of the human document as one retrieved context chunk.
    # (Iterating over the raw string would yield single characters.)
    context = [clean_retrieved_context(chunk) for chunk in human_scdd_text.split("\n")]

    # Build a single-sample RAGAS evaluation dataset.
    dataset = [{
        "user_input": user_input if user_input else "N/A",
        "retrieved_contexts": context,
        "response": ai_scdd_text,
        "reference": human_scdd_text
    }]
    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Define the metrics to compute.
    metrics = [
        SemanticSimilarity(),
        ResponseGroundedness(),
        Faithfulness(),
        AnswerAccuracy(),
        FactualCorrectness(coverage="low", atomicity="low")
    ]

    # Run the RAGAS evaluation.
    ragas_result = evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm,
        embeddings=embeddings
    )

    # Format, plot, interpret, and package the results.
    formatted_scores = format_ragas_results(ragas_result)
    radar_chart_path = plot_radar_chart(ragas_result)
    interpretation = interpret_ragas_results_with_gpt(formatted_scores, llm)
    word_report_path = generate_word_report(user_input, ragas_result, radar_chart_path, interpretation)
    return formatted_scores, radar_chart_path, interpretation, word_report_path


# ----- Gradio Interface -----
interface = gr.Interface(
    fn=evaluate_scdd,
    inputs=[
        gr.File(label="Upload AI-Generated SCDD (Word .docx)", type='filepath'),
        gr.File(label="Upload Human-Generated SCDD (Word .docx)", type='filepath'),
        gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
    ],
    outputs=[
        gr.JSON(label="RAGAS Evaluation Scores"),
        gr.Image(label="RAGAS Metrics Radar Chart"),
        gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"),
        gr.File(label="Download Word Report")
    ],
    title="RAGAS Evaluation: AI vs Human SCDD",
    description="Compare AI-generated and human-generated science case documents using RAGAS LLM-powered metrics."
)

if __name__ == "__main__":
    interface.launch()