import pickle
import time
from collections import Counter
from copy import deepcopy

import nltk
import numpy as np
import spacy
from bert_score import score
from nltk.corpus import stopwords
from scipy.stats import entropy
from textstat import textstat
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")

nlp = spacy.load("en_core_web_sm")


class T5Model:
    """
    A wrapper around a pretrained T5 summarization model.

    Parameters
    ----------
    model_name : str
        The name of the pretrained T5 model.
    """

    def __init__(self, model_name):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def summarize(self, text):
        """
        Generate a summary for the given text.

        Tokenize -> generate the summary -> decode the text.

        Parameters
        ----------
        text : str
            The input text to summarize.

        Returns
        -------
        summary : str
            The generated summary.
        elapsed_time : float
            The time taken for summarization in seconds.
        """
        inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        # Time only the generation step, not tokenization or decoding.
        start_time = time.time()
        outputs = self.model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
        end_time = time.time()
        summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return summary, end_time - start_time


class MetaModel:
    """
    A meta-model that routes each text to the best T5Model, based on
    extracted features and a base classifier.

    Parameters
    ----------
    model_names : list of str
        List of pretrained T5 model names.
    base_classifier : object
        A classifier with fit/predict methods (scikit-learn style) used to
        predict the best model.
    tolerance : float, optional
        Tolerance threshold for model selection (default is 0.01).
    """

    def __init__(self, model_names, base_classifier, tolerance=0.01):
        self.models = {name: T5Model(name) for name in model_names}
        self.base_classifier = deepcopy(base_classifier)
        self.tolerance = tolerance

    def fit(self, texts, summaries):
        """
        Fit the base classifier on extracted features against best-model labels.

        Parameters
        ----------
        texts : list of str
            List of input texts.
        summaries : list of str
            List of reference summaries.
        """
        X = np.array([list(extract_features(text).values()) for text in texts])
        y = get_best_model(self.models, texts, summaries, self.tolerance)
        self.base_classifier.fit(X, y)

    def summarize(self, text):
        """
        Summarize text using the predicted best model.

        Parameters
        ----------
        text : str
            The input text to summarize.

        Returns
        -------
        summary : str
            The generated summary.
        elapsed_time : float
            The time taken for summarization in seconds.
        """
        features = np.array(list(extract_features(text).values()))[np.newaxis, :]
        predicted_model_index = self.base_classifier.predict(features)[0]
        predicted_model_name = list(self.models.keys())[predicted_model_index]
        return self.models[predicted_model_name].summarize(text)


def save_object(obj, filename):
    """Serialize an object to `filename` with pickle."""
    with open(filename, "wb") as f:
        pickle.dump(obj, f)


def load_object(filename):
    """Load a pickled object from `filename`."""
    with open(filename, "rb") as f:
        return pickle.load(f)
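

# Example usage (a minimal sketch, not executed here): the checkpoint names
# and the RandomForestClassifier are illustrative assumptions; any T5
# checkpoints and any classifier exposing fit/predict should work.
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     meta = MetaModel(["t5-small", "t5-base"], RandomForestClassifier())
#     meta.fit(train_texts, train_summaries)   # parallel lists of str
#     summary, seconds = meta.summarize("A long article ...")
#     save_object(meta, "meta_model.pkl")      # persist the fitted meta-model
#     meta = load_object("meta_model.pkl")     # restore it later

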
def get_best_model(models, texts, summaries, tolerance):
    """
    Determine the best model for each text based on BERTScore and summarization time.

    For each text, candidates are ranked by BERTScore F1 (descending),
    breaking ties on elapsed time; a faster candidate then replaces the
    top one whenever its F1 lies within `tolerance` of the top F1.

    Parameters
    ----------
    models : dict
        Dictionary mapping model names to T5Model instances.
    texts : list of str
        List of input texts.
    summaries : list of str
        List of reference summaries.
    tolerance : float
        Tolerance threshold for model selection.

    Returns
    -------
    y : np.ndarray
        Array of indices corresponding to the best model for each text.
    """
    best_model_labels = []
    for i, text in enumerate(texts):
        model_results = []
        for model_name, model in models.items():
            summary, elapsed_time = model.summarize(text)
            _, _, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
            model_results.append((model_name, F1.item(), elapsed_time))
        # Rank by F1 descending, then by elapsed time ascending.
        model_results.sort(key=lambda x: (-x[1], x[2]))
        # Tolerance rule: accept a faster model as long as its F1 stays
        # within `tolerance` of the top-ranked F1.
        best_model, top_score, best_time = model_results[0]
        for model_name, f1_score, elapsed_time in model_results[1:]:
            if top_score - f1_score <= tolerance and elapsed_time < best_time:
                best_model, best_time = model_name, elapsed_time
        best_model_labels.append(best_model)
    y = np.array([list(models.keys()).index(m) for m in best_model_labels])
    return y


def extract_features(text):
    """
    Extract linguistic and statistical features from a text.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    features : dict
        Dictionary of extracted features:
        - num_words : int
        - avg_word_length : float
        - num_sentences : int
        - avg_sentence_length : float
        - avg_syntax_depth : float
        - num_subordinates : int
        - num_verbs : int
        - num_passive : int
        - type_token_ratio : float
        - lexical_entropy : float
        - syllables_per_word : float
        - complex_words : int
        - stopword_ratio : float
    """
    doc = nlp(text)
    num_words = len(doc)

    word_lengths = [len(token.text) for token in doc if token.is_alpha]
    avg_word_length = np.mean(word_lengths) if word_lengths else 0

    sentences = list(doc.sents)
    num_sentences = len(sentences)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Syntactic depth, approximated by the mean absolute token-to-head
    # distance in the dependency parse.
    depths = [abs(token.head.i - token.i) for token in doc if token.head != token]
    avg_syntax_depth = np.mean(depths) if depths else 0

    subordinate_conjunctions = {
        "because",
        "although",
        "since",
        "unless",
        "whereas",
        "while",
        "though",
        "if",
    }
    num_subordinates = sum(1 for token in doc if token.text.lower() in subordinate_conjunctions)
    num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
    # Passive voice, counted via passive-auxiliary dependency labels.
    num_passive = sum(1 for token in doc if token.dep_ == "auxpass")

    words = [token.text.lower() for token in doc if token.is_alpha]
    unique_words = set(words)
    type_token_ratio = len(unique_words) / len(words) if len(words) > 0 else 0

    # Shannon entropy of the word-frequency distribution; scipy normalizes
    # the counts to probabilities internally.
    word_freqs = Counter(words)
    lexical_entropy = entropy(list(word_freqs.values())) if words else 0

    syllables_per_word = (
        np.mean([textstat.syllable_count(token.text) for token in doc if token.is_alpha])
        if words
        else 0
    )
    complex_words = sum(1 for token in doc if textstat.syllable_count(token.text) >= 3)

    stop_words = set(stopwords.words("english"))
    stopword_ratio = (
        sum(1 for word in words if word in stop_words) / num_words if num_words > 0 else 0
    )

    return {
        "num_words": num_words,
        "avg_word_length": avg_word_length,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "avg_syntax_depth": avg_syntax_depth,
        "num_subordinates": num_subordinates,
        "num_verbs": num_verbs,
        "num_passive": num_passive,
        "type_token_ratio": type_token_ratio,
        "lexical_entropy": lexical_entropy,
        "syllables_per_word": syllables_per_word,
        "complex_words": complex_words,
        "stopword_ratio": stopword_ratio,
    }
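

if __name__ == "__main__":
    # Smoke test for the feature extractor (illustrative only; the sample
    # text is an assumption, not project data). Requires the spaCy model and
    # NLTK corpora set up at import time above.
    sample = (
        "Although the storm had passed, the harbor remained closed because "
        "inspections were still underway, and the crews waited while the "
        "damaged piers were being repaired."
    )
    for name, value in extract_features(sample).items():
        print(f"{name}: {value}")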