import pickle
import time
from collections import Counter
from copy import deepcopy

import nltk
import numpy as np
import spacy
from bert_score import score
from nltk.corpus import stopwords
from scipy.stats import entropy
from textstat import textstat
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")

nlp = spacy.load("en_core_web_sm")
class T5Model:
    """
    A class to encapsulate a T5 summarization model.

    Parameters
    ----------
    model_name : str
        The name of the pretrained T5 model.
    """

    def __init__(self, model_name):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def summarize(self, text):
        """
        Generate a summary for the given text.

        Tokenize -> generate the summary -> decode the text.

        Parameters
        ----------
        text : str
            The input text to summarize.

        Returns
        -------
        summary : str
            The generated summary.
        elapsed_time : float
            The time taken for summarization in seconds.
        """
        inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        start_time = time.time()
        outputs = self.model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
        end_time = time.time()
        summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return summary, end_time - start_time
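

# Illustrative usage sketch (not part of the original pipeline). Any seq2seq
# checkpoint accepted by AutoModelForSeq2SeqLM should work; "t5-small" below is
# only an example name:
#
#     model = T5Model("t5-small")
#     summary, elapsed = model.summarize("The long article text to condense ...")
#     print(f"{summary} ({elapsed:.2f}s)")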
class MetaModel:
    """
    A meta model that selects the best T5Model based on extracted features and a base classifier.

    Parameters
    ----------
    model_names : list of str
        List of pretrained T5 model names.
    base_classifier : object
        A classifier instance used to predict the best model.
    tolerance : float, optional
        Tolerance threshold for model selection (default is 0.01).
    """

    def __init__(self, model_names, base_classifier, tolerance=0.01):
        self.models = {name: T5Model(name) for name in model_names}
        self.base_classifier = deepcopy(base_classifier)
        self.tolerance = tolerance

    def fit(self, texts, summaries):
        """
        Fit the base classifier using extracted features and best model labels.

        Parameters
        ----------
        texts : list of str
            List of input texts.
        summaries : list of str
            List of reference summaries.
        """
        X = np.array([list(extract_features(text).values()) for text in texts])
        y = get_best_model(self.models, texts, summaries, self.tolerance)
        self.base_classifier.fit(X, y)

    def summarize(self, text):
        """
        Summarize text using the predicted best model.

        Parameters
        ----------
        text : str
            The input text to summarize.

        Returns
        -------
        summary : str
            The generated summary.
        elapsed_time : float
            The time taken for summarization in seconds.
        """
        features = np.array(list(extract_features(text).values()))[np.newaxis, :]
        predicted_model_index = self.base_classifier.predict(features)[0]
        predicted_model_name = list(self.models.keys())[predicted_model_index]
        return self.models[predicted_model_name].summarize(text)
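

# Illustrative usage sketch (the base classifier is an assumption; any
# scikit-learn-style estimator with fit/predict should do, and the training
# lists stand in for your own data):
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     meta = MetaModel(["t5-small", "t5-base"], RandomForestClassifier(), tolerance=0.01)
#     meta.fit(train_texts, train_summaries)
#     summary, elapsed = meta.summarize(new_text)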
def save_object(obj, filename):
    """Serialize an object to disk with pickle."""
    with open(filename, "wb") as f:
        pickle.dump(obj, f)


def load_object(filename):
    """Load a pickled object from disk."""
    with open(filename, "rb") as f:
        return pickle.load(f)
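

# For instance, a fitted MetaModel can be persisted and restored between runs
# (the filename is illustrative):
#
#     save_object(meta, "meta_model.pkl")
#     meta = load_object("meta_model.pkl")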
def get_best_model(models, texts, summaries, tolerance):
    """
    Determine the best model for each text based on BERTScore and summarization time.

    Parameters
    ----------
    models : dict
        Dictionary mapping model names to T5Model instances.
    texts : list of str
        List of input texts.
    summaries : list of str
        List of reference summaries.
    tolerance : float
        Tolerance threshold for model selection.

    Returns
    -------
    y : np.ndarray
        Array of indices corresponding to the best model for each text.
    """
    best_model_labels = []
    for i, text in enumerate(texts):
        model_results = []
        for model_name, model in models.items():
            summary, elapsed_time = model.summarize(text)
            P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
            f1_score = F1.item()
            model_results.append((model_name, f1_score, elapsed_time))
        # Sort by descending F1, then ascending summarization time.
        model_results.sort(key=lambda x: (-x[1], x[2]))
        # Tolerance rule: a faster model replaces the current best if its F1
        # is within `tolerance` of the current best F1.
        best_model, best_score, best_time = model_results[0]
        for model_name, f1_score, elapsed_time in model_results[1:]:
            if best_score - f1_score <= tolerance and elapsed_time < best_time:
                best_model, best_score, best_time = model_name, f1_score, elapsed_time
        best_model_labels.append(best_model)
    y = np.array([list(models.keys()).index(m) for m in best_model_labels])
    return y
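

# Worked example of the tolerance rule (numbers are illustrative): with
# tolerance=0.01 and results sorted as
#     [("t5-base", 0.902, 3.1 s), ("t5-small", 0.896, 0.9 s)]
# "t5-small" is selected, since 0.902 - 0.896 = 0.006 <= 0.01 and 0.9 s < 3.1 s.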
def extract_features(text):
    """
    Extract linguistic and statistical features from a text.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    features : dict
        Dictionary of extracted features:
        - num_words : int
        - avg_word_length : float
        - num_sentences : int
        - avg_sentence_length : float
        - avg_syntax_depth : float
        - num_subordinates : int
        - num_verbs : int
        - num_passive : int
        - type_token_ratio : float
        - lexical_entropy : float
        - syllables_per_word : float
        - complex_words : int
        - stopword_ratio : float
    """
    doc = nlp(text)
    num_words = len(doc)
    alpha_tokens = [token for token in doc if token.is_alpha]
    avg_word_length = (
        np.mean([len(token.text) for token in alpha_tokens]) if alpha_tokens else 0
    )
    sentences = list(doc.sents)
    num_sentences = len(sentences)
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Syntactic depth, approximated by the signed index offset between each
    # token and its syntactic head.
    depths = [token.head.i - token.i for token in doc if token.head != token]
    avg_syntax_depth = np.mean(depths) if depths else 0

    subordinate_conjunctions = {
        "because",
        "although",
        "since",
        "unless",
        "whereas",
        "while",
        "though",
        "if",
    }
    num_subordinates = sum(1 for token in doc if token.text.lower() in subordinate_conjunctions)
    num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
    num_passive = sum(1 for token in doc if token.dep_ == "auxpass")

    words = [token.text.lower() for token in alpha_tokens]
    unique_words = set(words)
    type_token_ratio = len(unique_words) / len(words) if len(words) > 0 else 0

    word_freqs = Counter(words)
    word_probs = np.array(list(word_freqs.values())) / len(words) if words else np.array([1.0])
    lexical_entropy = entropy(word_probs)

    syllables_per_word = (
        np.mean([textstat.syllable_count(token.text) for token in alpha_tokens])
        if alpha_tokens
        else 0
    )
    complex_words = sum(1 for token in doc if textstat.syllable_count(token.text) >= 3)

    stop_words = set(stopwords.words("english"))
    stopword_ratio = (
        sum(1 for word in words if word in stop_words) / num_words if num_words > 0 else 0
    )

    return {
        "num_words": num_words,
        "avg_word_length": avg_word_length,
        "num_sentences": num_sentences,
        "avg_sentence_length": avg_sentence_length,
        "avg_syntax_depth": avg_syntax_depth,
        "num_subordinates": num_subordinates,
        "num_verbs": num_verbs,
        "num_passive": num_passive,
        "type_token_ratio": type_token_ratio,
        "lexical_entropy": lexical_entropy,
        "syllables_per_word": syllables_per_word,
        "complex_words": complex_words,
        "stopword_ratio": stopword_ratio,
    }
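

# Minimal end-to-end sketch. Assumptions not taken from the module itself:
# "t5-small"/"t5-base" as candidate checkpoints, scikit-learn's
# RandomForestClassifier as the base classifier, and a toy in-memory dataset.
# Running it downloads the T5 checkpoints and the BERTScore scorer, so it is
# meant as an illustration rather than a benchmark.
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier

    train_texts = [
        "The city council approved a new budget that increases funding for public transit.",
        "Researchers reported that the new vaccine showed strong results in early trials.",
    ]
    train_summaries = [
        "City council approves budget boosting public transit funding.",
        "New vaccine shows strong early-trial results.",
    ]

    meta = MetaModel(["t5-small", "t5-base"], RandomForestClassifier(), tolerance=0.01)
    meta.fit(train_texts, train_summaries)

    summary, elapsed = meta.summarize(train_texts[0])
    print(f"Summary: {summary}")
    print(f"Time: {elapsed:.2f}s")

    # Persist the fitted meta model for later reuse (filename is illustrative).
    save_object(meta, "meta_model.pkl")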