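"""Threshold tuning and review flagging for AI-generated Portuguese educational content.

Overview of the script below:
  1. Generate synthetic chapter-style content for a handful of topics.
  2. Score each generated/reference pair with semantic similarity, BERTScore, BLEU,
     ROUGE-L and METEOR, and collect an LLM-as-judge verdict.
  3. Tune the per-metric review thresholds with Hyperopt (TPE), logging every trial
     to MLflow.
  4. Reuse the best thresholds to decide whether new content needs human review.
"""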
import nltk
import mlflow
import torch
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from datasets import load_metric  # deprecated in newer `datasets` releases; `evaluate.load` is the maintained replacement

# Download the NLTK data used for tokenization and by the METEOR metric
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  # METEOR relies on WordNet for synonym matching

# --- Load pre-trained models ---
# Review and update these with the most recent and capable Portuguese models available
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# Perplexity requires a decoder-only (causal) checkpoint; PTT5 is an encoder-decoder
# model and will not load with AutoModelForCausalLM. A Portuguese GPT-2 is used here
# as an example; substitute your preferred causal LM.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor")  # Additional metric

# Load an LLM for generating and judging content.
# NOTE: "gpt-3.5-turbo", GPT-4, and Gemini are API-only models and cannot be loaded with
# transformers.pipeline; to use them, call the provider's API instead. Here we assume an
# open Hub checkpoint with Portuguese coverage as an example; swap in a stronger model if available.
generator_model_name = "bigscience/bloom-560m"  # example open multilingual checkpoint
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # the same model is reused as the judge
judge = pipeline("text-generation", model=judge_model_name)
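
# NOTE: all of the models above load on CPU by default; if a GPU is available,
# pass device=0 to pipeline(...) or move the models explicitly with .to("cuda").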

# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates the perplexity of `text` under the Portuguese causal language model."""
    try:
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input).loss
            return torch.exp(loss).item()
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')


def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0 


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None

    try:
        bleu_results = bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None

    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        rouge_l = rouge_results['rougeL']
        # Older load_metric("rouge") returns AggregateScore objects; newer versions return floats.
        results['rougeL'] = rouge_l.mid.fmeasure if hasattr(rouge_l, 'mid') else float(rouge_l)
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None

    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None

    return results


def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content. 
    Please evaluate the following generated text based on its accuracy, relevance, and clarity, 
    compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision. 
    """
    # Generate only the continuation (not the prompt) and map it onto one of the
    # three expected categories; fall back to "major issues" if the answer is unclear.
    raw = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text'].strip().lower()
    for category in ("no issues", "minor issues", "major issues"):
        if category in raw:
            return category
    return "major issues"
    

# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text, 
                                 similarity_threshold, 
                                 bertscore_threshold,
                                 bleu_threshold,
                                 rouge_threshold,
                                 meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

    # Use the LLM judgment as the primary decision-maker; the metric-based issues are
    # returned separately so threshold optimization can compare the two signals.
    if llm_judgment == "major issues":
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif llm_judgment == "minor issues":
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'metric_issues': issues,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }


# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least 
    one subsection. The content should be factually accurate, well-organized, 
    and written in clear and concise Portuguese.
    """
    # Return only the newly generated continuation, not the prompt itself.
    generated_content = generator(prompt, max_new_tokens=1000, return_full_text=False)[0]['generated_text']
    return generated_content

def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']

    # Generate synthetic evaluation pairs. Note that both the "reference" and the
    # "generated" text come from the same generator here; substitute curated
    # reference material if you have it.
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic) 
        generated_text = generate_educational_content(topic) 
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)

    # Count disagreements between the metric-threshold flags and the LLM judge:
    # thresholds that flag content the judge accepts (or miss content the judge
    # rejects) are penalized, so Hyperopt aligns the thresholds with the judge.
    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                            similarity_threshold,
                                            bertscore_threshold,
                                            bleu_threshold,
                                            rouge_threshold,
                                            meteor_threshold)
        metrics_flagged = bool(result['metric_issues'])
        llm_flagged = result['llm_judgment'] != "no issues"
        if metrics_flagged != llm_flagged:
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}

    
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
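    # NOTE: this assumes an MLflow tracking server is reachable at the URI above
    # (for example one started locally with `mlflow server`); without it, the
    # logging calls inside `objective` will fail.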
    search_space = { # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=50,  # Adjust the number of evaluations as needed
                trials=trials)
    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática") # Example
    new_reference_text = "Content from your educational material..." 

    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")