import nltk
import mlflow
import torch
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
import evaluate  # replaces the deprecated datasets.load_metric

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  # required by the METEOR metric

# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models.
# SentenceTransformer wraps this plain BERT checkpoint with mean pooling automatically.
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')

# Perplexity needs a causal (GPT-style) language model; a T5 checkpoint such as
# unicamp-dl/ptt5-base-portuguese-vocab is seq2seq and cannot be loaded with AutoModelForCausalLM.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"  # example Portuguese causal LM
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name).eval()
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
bertscore_metric = evaluate.load("bertscore")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")  # additional metric

# Load a powerful LLM for generating and judging content.
# Note: "gpt-3.5-turbo" (or GPT-4 / Gemini) is an API-only model and cannot be loaded
# through transformers.pipeline; substitute any instruction-tuned Hugging Face model.
generator_model_name = "HuggingFaceH4/zephyr-7b-beta"  # example instruction-tuned model
generator = pipeline("text-generation", model=generator_model_name)
judge = generator  # reuse the same pipeline for judging instead of loading the model twice

# --- Helper Functions ---

def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese causal language model."""
    try:
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input).loss
        return torch.exp(loss).item()
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')


def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE-L, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(
            predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None
    try:
        # evaluate's BLEU tokenizes internally, so raw strings are passed.
        bleu_results = bleu_metric.compute(predictions=[generated_text],
                                           references=[[reference_text]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None
    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text],
                                             references=[reference_text])
        results['rougeL'] = rouge_results['rougeL']
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None
    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text],
                                               references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None
    return results
def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content. Please evaluate the following generated text
    based on its accuracy, relevance, and clarity, compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.
    """
    # Generate only the continuation (not the echoed prompt) and map it onto one of the three labels.
    response = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text'].lower()
    for label in ("major issues", "minor issues", "no issues"):
        if label in response:
            return label
    return "major issues"  # conservative default when the reply cannot be parsed

# --- Content Analysis Function ---

def analyze_content_for_review(generated_text, reference_text, similarity_threshold,
                               bertscore_threshold, bleu_threshold, rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

    # Use the LLM judgment as the primary decision-maker
    if llm_judgment == "major issues":
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif llm_judgment == "minor issues":
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'issues': issues,
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }

# --- Threshold Optimization Functions ---

def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least one subsection.
    The content should be factually accurate, well-organized, and written in clear and concise Portuguese.
    """
    generated_content = generator(prompt, max_new_tokens=1000, return_full_text=False)[0]['generated_text']
    return generated_content


def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']

    # Generate AI-created data
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]  # more topics
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)

    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text, similarity_threshold,
                                            bertscore_threshold, bleu_threshold,
                                            rouge_threshold, meteor_threshold)
        # Count disagreements between the threshold-based flags and the LLM judgment:
        # good thresholds should raise metric issues exactly when the LLM also sees problems.
        metric_flag = len(result['issues']) > 0
        llm_flag = result['llm_judgment'] != "no issues"
        if metric_flag != llm_flag:
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}

# --- Main Execution ---

if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # or your MLflow server URI

    search_space = {  # hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }

    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # adjust the number of evaluations as needed
                           trials=trials)

    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # example
    new_reference_text = "Content from your educational material..."

    evaluation_result = analyze_content_for_review(
        new_generated_text,
        new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")