# metrics_analyzer/metrics.py
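"""Quality metrics for AI-generated Portuguese educational content.

Combines reference-based metrics (semantic similarity, BERTScore, BLEU, ROUGE-L,
METEOR), perplexity from a Portuguese causal LM, and an LLM judge, then tunes the
metric thresholds with Hyperopt while logging each trial to MLflow.
"""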
import nltk
import mlflow
import torch
import evaluate  # replaces the deprecated `datasets.load_metric`
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# Perplexity needs a causal (GPT-style) language model; the PTT5 checkpoint is an
# encoder-decoder and cannot be loaded with AutoModelForCausalLM.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"  # Portuguese GPT-2
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
# Load metrics via the `evaluate` library (datasets.load_metric is deprecated)
bertscore_metric = evaluate.load("bertscore")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")
# Load an LLM for generating and judging content.
# NOTE: "gpt-3.5-turbo", GPT-4, and Gemini are API-only models and cannot be loaded
# with transformers.pipeline; use an open instruction-tuned model locally, or swap
# these pipelines for API calls (see the sketch below).
generator_model_name = "HuggingFaceH4/zephyr-7b-beta"  # example open instruct model
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # the same model doubles as the judge
judge = pipeline("text-generation", model=judge_model_name)
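# Optional sketch (an assumption, not part of the original pipeline): if you do want
# the OpenAI-hosted "gpt-3.5-turbo"/GPT-4 mentioned above, they must be called through
# the OpenAI API rather than transformers.pipeline. The helper below is a minimal,
# hypothetical example and assumes the `openai` package (>=1.0) and an OPENAI_API_KEY
# environment variable are available.
def openai_generate(prompt, model="gpt-3.5-turbo", max_tokens=512):
    """Hypothetical helper: generate text with an OpenAI chat model."""
    from openai import OpenAI  # lazy import so the rest of the script runs without it
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content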
# --- Helper Functions ---
def calculate_perplexity(text):
"""Calculates perplexity of text using a Portuguese LLM model."""
try:
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt', truncation=True)
            loss = perplexity_model(tensor_input, labels=tensor_input).loss
            return torch.exp(loss).item()
except Exception as e:
print(f"Error calculating perplexity: {e}")
return float('inf')
def estimate_semantic_similarity(generated_text, reference_text):
"""Estimates semantic similarity using a Portuguese Sentence Transformer."""
try:
embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
return cosine_sim.item()
except Exception as e:
print(f"Error calculating semantic similarity: {e}")
return 0.0
def calculate_metrics(generated_text, reference_text):
"""Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
results = {}
try:
results['bertscore'] = bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
except Exception as e:
print(f"Error calculating BERTScore: {e}")
results['bertscore'] = None
try:
        bleu_results = bleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
results['bleu'] = bleu_results['bleu']
except Exception as e:
print(f"Error calculating BLEU: {e}")
results['bleu'] = None
try:
rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
results['rougeL'] = rouge_results['rougeL']
except Exception as e:
print(f"Error calculating ROUGE: {e}")
results['rougeL'] = None
try:
meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
results['meteor'] = meteor_results['meteor']
except Exception as e:
print(f"Error calculating METEOR: {e}")
results['meteor'] = None
return results
def get_llm_judgment(generated_text, reference_text):
"""Gets a judgment from a powerful LLM on the quality of the generated text."""
prompt = f"""
You are an expert in evaluating educational content.
Please evaluate the following generated text based on its accuracy, relevance, and clarity,
compared to the provided reference text.
Reference Text:
{reference_text}
Generated Text:
{generated_text}
Provide your judgment as one of the following categories:
- "no issues": The generated text is accurate, relevant, and clear.
- "minor issues": The generated text has some minor issues, but is mostly acceptable.
- "major issues": The generated text has significant issues and needs substantial revision.
"""
    response = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text']
    judgment = response.strip().lower()
    # Map the free-form model output onto the three expected labels
    for label in ("major issues", "minor issues", "no issues"):
        if label in judgment:
            return label
    return "no issues"  # default when the output cannot be parsed
# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
similarity_threshold,
bertscore_threshold,
bleu_threshold,
rouge_threshold,
meteor_threshold):
"""Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
similarity = estimate_semantic_similarity(generated_text, reference_text)
metrics = calculate_metrics(generated_text, reference_text)
llm_judgment = get_llm_judgment(generated_text, reference_text)
issues = []
if similarity < similarity_threshold:
issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")
# Use LLM judgment as the primary decision-maker
if llm_judgment == "major issues":
review_flag = True
explanation = f"LLM Judgment: **Major Issues**\n" + "\n".join(issues)
elif llm_judgment == "minor issues":
review_flag = True
explanation = f"LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
else:
review_flag = False
explanation = "LLM Judgment: **No Issues**"
    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'metric_issues': issues,
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }
# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
"""Generates educational content with chapters, topics, sections, and subsections."""
prompt = f"""
Generate a chapter of educational content on the topic of "{topic}".
The chapter should include {num_sections} sections, each with at least
one subsection. The content should be factually accurate, well-organized,
and written in clear and concise Portuguese.
"""
    generated_content = generator(prompt, max_new_tokens=1000, return_full_text=False)[0]['generated_text']
return generated_content
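# Note: during threshold optimization below, the same generator produces both the
# "reference" and the "generated" text for each topic, so the thresholds are tuned
# purely on synthetic pairs rather than on curated reference material.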
def objective(params):
"""Objective function for Hyperopt to minimize."""
similarity_threshold = params['similarity_threshold']
bertscore_threshold = params['bertscore_threshold']
bleu_threshold = params['bleu_threshold']
rouge_threshold = params['rouge_threshold']
meteor_threshold = params['meteor_threshold']
# Generate AI-created data
topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"] # More topics
generated_texts = []
reference_texts = []
for topic in topics:
reference_text = generate_educational_content(topic)
generated_text = generate_educational_content(topic)
generated_texts.append(generated_text)
reference_texts.append(reference_text)
total_errors = 0
for gen_text, ref_text in zip(generated_texts, reference_texts):
result = analyze_content_for_review(gen_text, ref_text,
similarity_threshold,
bertscore_threshold,
bleu_threshold,
rouge_threshold,
meteor_threshold)
        # A threshold "error" is a disagreement: the metric thresholds flag issues
        # while the LLM judge reports none.
        if result['metric_issues'] and result['llm_judgment'] == "no issues":
            total_errors += 1
# Log metrics and parameters to MLflow
with mlflow.start_run():
mlflow.log_params(params)
mlflow.log_metric("total_errors", total_errors)
return {'loss': total_errors, 'status': STATUS_OK}
# --- Main Execution ---
if __name__ == "__main__":
# 1. Threshold Optimization Phase
mlflow.set_tracking_uri("http://localhost:5000") # Or your MLflow server URI
search_space = { # Hyperparameter search space
'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
}
trials = Trials()
best_thresholds = fmin(fn=objective,
space=search_space,
algo=tpe.suggest,
max_evals=50, # Adjust the number of evaluations as needed
trials=trials)
print("Best thresholds found:", best_thresholds)
# 2. Content Evaluation Phase (using the best thresholds)
new_generated_text = generate_educational_content("Matemática") # Example
new_reference_text = "Content from your educational material..."
evaluation_result = analyze_content_for_review(
new_generated_text, new_reference_text,
best_thresholds['similarity_threshold'],
best_thresholds['bertscore_threshold'],
best_thresholds['bleu_threshold'],
best_thresholds['rouge_threshold'],
best_thresholds['meteor_threshold']
)
print("\n----- Evaluation Result -----")
print(f"Review Flag: {evaluation_result['review_flag']}")
print(f"Explanation: {evaluation_result['explanation']}")