tlemagueresse committed on
Commit c53eedd · 1 Parent(s): d75519d

First model in WIP

Files changed (4)
  1. README.md +0 -12
  2. demo.ipynb +0 -0
  3. model.py +226 -51
  4. requirements.txt +3 -2
README.md CHANGED
@@ -1,12 +0,0 @@
- ---
- title: OptimAbstract
- emoji: ⚡
- colorFrom: indigo
- colorTo: pink
- sdk: gradio
- sdk_version: 5.16.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
demo.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
model.py CHANGED
@@ -1,38 +1,58 @@
+ import pickle
  import time
  from collections import Counter
+ from copy import deepcopy

+ import nltk
  import numpy as np
  import spacy
- import torch
+ from nltk.corpus import stopwords
+ from textstat import textstat
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
- from datasets import load_dataset
  from bert_score import score
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler
  from scipy.stats import entropy

- def compute_entropy(text):
-     words = text.split()
-     word_freq = Counter(words)
-     probs = np.array(list(word_freq.values())) / sum(word_freq.values())
-     return entropy(probs)

+ nltk.download("punkt")
+ nltk.download("averaged_perceptron_tagger")
+ nltk.download("stopwords")

- def compute_syntactic_complexity(text):
-     nlp = spacy.load("en_core_web_sm")
-     doc = nlp(text)
-     depths = [token.head.i - token.i for token in doc if token.head != token]
-     return np.mean(depths) if depths else 0
+ nlp = spacy.load("en_core_web_sm")


  class T5Model:
+     """
+     A class to encapsulate a T5 summarization model.
+
+     Parameters
+     ----------
+     model_name : str
+         The name of the pretrained T5 model.
+     """
+
      def __init__(self, model_name):
          self.model_name = model_name
          self.tokenizer = AutoTokenizer.from_pretrained(model_name)
          self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

      def summarize(self, text):
+         """
+         Generate a summary for the given text.
+
+         Tokenize -> generate the summary -> decode the text.
+
+         Parameters
+         ----------
+         text : str
+             The input text to summarize.
+
+         Returns
+         -------
+         summary : str
+             The generated summary.
+         elapsed_time : float
+             The time taken for summarization in seconds.
+         """
          inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
          start_time = time.time()
          outputs = self.model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
@@ -42,50 +62,205 @@ class T5Model:


  class MetaModel:
-     def __init__(self, model_names):
-         self.models = {name: T5Model(name) for name in model_names}
-         self.classifier = RandomForestClassifier(n_estimators=100, random_state=42)
-
-     def extract_features(self, text):
-         words = text.split()
-         num_words = len(words)
-         avg_word_length = np.mean([len(w) for w in words]) if words else 0
-         complexity = compute_syntactic_complexity(text)
-         entropy = compute_entropy(text)
-         return [num_words, avg_word_length, complexity, entropy]
+     """
+     A meta model that selects the best T5Model based on extracted features and a base classifier.
+
+     Parameters
+     ----------
+     model_names : list of str
+         List of pretrained T5 model names.
+     base_classifier : object
+         A classifier instance used to predict the best model.
+     tolerance : float, optional
+         Tolerance threshold for model selection (default is 0.01).
+     """
+
+     def __init__(self, model_names, base_classifier, tolerance=0.01):
+         self.models = {name: T5Model(name) for name in model_names}
+         self.base_classifier = deepcopy(base_classifier)
+         self.tolerance = tolerance

      def fit(self, texts, summaries):
-         X = np.array([self.extract_features(text) for text in texts])
-
-         best_model_labels = []
-         tolerance = 0.05  # BERTScore tolerance
-
-         for i, text in enumerate(texts):
-             model_results = []
-             for model_name, model in self.models.items():
-                 summary, elapsed_time = model.summarize(text)
-                 P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
-                 f1_score = F1.item()
-                 model_results.append((model_name, f1_score, elapsed_time))
-
-             # Sort models by BERTScore (desc) and then by time (asc)
-             model_results.sort(key=lambda x: (-x[1], x[2]))
-
-             # Select best model based on tolerance rule
-             best_model, best_score, best_time = model_results[0]
-             for model_name, f1_score, elapsed_time in model_results[1:]:
-                 if best_score - f1_score <= tolerance and elapsed_time < best_time:
-                     best_model, best_score, best_time = model_name, f1_score, elapsed_time
-
-             best_model_labels.append(best_model)
-
-         y = np.array([list(self.models.keys()).index(m) for m in best_model_labels])
-         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-         self.classifier.fit(X_train, y_train)
+         """
+         Fit the base classifier using extracted features and best model labels.
+
+         Parameters
+         ----------
+         texts : list of str
+             List of input texts.
+         summaries : list of str
+             List of reference summaries.
+         """
+         X = np.array([list(extract_features(text).values()) for text in texts])
+         y = get_best_model(self.models, texts, summaries, self.tolerance)
+         self.base_classifier.fit(X, y)

-     def summarize(self, text):
-         features = np.array([self.extract_features(text)])
-         predicted_model_index = self.classifier.predict(features)[0]
-         predicted_model_name = list(self.models.keys())[predicted_model_index]
-         return self.models[predicted_model_name].summarize(text)
+     def summarize(self, text):
+         """
+         Summarize text using the predicted best model.
+
+         Parameters
+         ----------
+         text : str
+             The input text to summarize.
+
+         Returns
+         -------
+         summary : str
+             The generated summary.
+         elapsed_time : float
+             The time taken for summarization in seconds.
+         """
+         features = np.array(list(extract_features(text).values()))[np.newaxis, :]
+         predicted_model_index = self.base_classifier.predict(features)[0]
+         predicted_model_name = list(self.models.keys())[predicted_model_index]
+         return self.models[predicted_model_name].summarize(text)
+
+
+ def save_object(obj, filename):
+     with open(filename, "wb") as f:
+         pickle.dump(obj, f)
+
+
+ def load_object(filename):
+     with open(filename, "rb") as f:
+         return pickle.load(f)
+
+
+ def get_best_model(models, texts, summaries, tolerance):
+     """
+     Determine the best model for each text based on BERTScore and summarization time.
+
+     Parameters
+     ----------
+     models : dict
+         Dictionary mapping model names to T5Model instances.
+     texts : list of str
+         List of input texts.
+     summaries : list of str
+         List of reference summaries.
+     tolerance : float
+         Tolerance threshold for model selection.
+
+     Returns
+     -------
+     y : np.ndarray
+         Array of indices corresponding to the best model for each text.
+     """
+     best_model_labels = []
+
+     for i, text in enumerate(texts):
+         model_results = []
+         for model_name, model in models.items():
+             summary, elapsed_time = model.summarize(text)
+             P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
+             f1_score = F1.item()
+             model_results.append((model_name, f1_score, elapsed_time))
+
+         # Sort models by BERTScore (desc) and then by time (asc)
+         model_results.sort(key=lambda x: (-x[1], x[2]))
+
+         # Select best model based on tolerance rule
+         best_model, best_score, best_time = model_results[0]
+         for model_name, f1_score, elapsed_time in model_results[1:]:
+             if best_score - f1_score <= tolerance and elapsed_time < best_time:
+                 best_model, best_score, best_time = model_name, f1_score, elapsed_time
+
+         best_model_labels.append(best_model)
+
+     y = np.array([list(models.keys()).index(m) for m in best_model_labels])
+
+     return y
+
+
+ def extract_features(text):
+     """
+     Extract linguistic and statistical features from a text.
+
+     Parameters
+     ----------
+     text : str
+         The input text.
+
+     Returns
+     -------
+     features : dict
+         Dictionary of extracted features:
+         - num_words : int
+         - avg_word_length : float
+         - num_sentences : int
+         - avg_sentence_length : float
+         - avg_syntax_depth : float
+         - num_subordinates : int
+         - num_verbs : int
+         - num_passive : int
+         - type_token_ratio : float
+         - lexical_entropy : float
+         - syllables_per_word : float
+         - complex_words : int
+         - stopword_ratio : float
+     """
+     doc = nlp(text)
+
+     num_words = len(doc)
+     avg_word_length = (
+         np.mean([len(token.text) for token in doc if token.is_alpha]) if num_words > 0 else 0
+     )
+
+     sentences = list(doc.sents)
+     num_sentences = len(sentences)
+     avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
+
+     # Syntactic depth
+     depths = [token.head.i - token.i for token in doc if token.head != token]
+     avg_syntax_depth = np.mean(depths) if depths else 0
+
+     subordinate_conjunctions = {
+         "because",
+         "although",
+         "since",
+         "unless",
+         "whereas",
+         "while",
+         "though",
+         "if",
+     }
+     num_subordinates = sum(1 for token in doc if token.text.lower() in subordinate_conjunctions)
+
+     num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
+     num_passive = sum(1 for token in doc if token.dep_ == "auxpass")
+
+     words = [token.text.lower() for token in doc if token.is_alpha]
+     unique_words = set(words)
+     type_token_ratio = len(unique_words) / len(words) if len(words) > 0 else 0
+
+     word_freqs = Counter(words)
+     word_probs = np.array(list(word_freqs.values())) / num_words if num_words > 0 else [1]
+     lexical_entropy = entropy(word_probs)
+
+     syllables_per_word = (
+         np.mean([textstat.syllable_count(token.text) for token in doc if token.is_alpha])
+         if num_words > 0
+         else 0
+     )
+     complex_words = sum(1 for token in doc if textstat.syllable_count(token.text) >= 3)
+
+     stop_words = set(stopwords.words("english"))
+     stopword_ratio = (
+         sum(1 for word in words if word in stop_words) / num_words if num_words > 0 else 0
+     )
+
+     return {
+         "num_words": num_words,
+         "avg_word_length": avg_word_length,
+         "num_sentences": num_sentences,
+         "avg_sentence_length": avg_sentence_length,
+         "avg_syntax_depth": avg_syntax_depth,
+         "num_subordinates": num_subordinates,
+         "num_verbs": num_verbs,
+         "num_passive": num_passive,
+         "type_token_ratio": type_token_ratio,
+         "lexical_entropy": lexical_entropy,
+         "syllables_per_word": syllables_per_word,
+         "complex_words": complex_words,
+         "stopword_ratio": stopword_ratio,
+     }
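For context, a minimal usage sketch of the MetaModel defined above; the checkpoint names, the RandomForestClassifier choice, and the example texts are illustrative assumptions, not part of this commit:

# Hypothetical usage sketch: assumes this file is importable as `model`,
# that scikit-learn is installed, and that t5-small / t5-base are acceptable
# checkpoints. Running it downloads the T5 and BERTScore models.
from sklearn.ensemble import RandomForestClassifier

from model import MetaModel, save_object

texts = [
    "The Eiffel Tower was completed in 1889 and remains one of the most visited monuments in the world.",
    "Photosynthesis converts light energy into chemical energy stored in glucose.",
]
references = [
    "The Eiffel Tower, finished in 1889, is a highly visited monument.",
    "Photosynthesis turns light into chemical energy as glucose.",
]

meta = MetaModel(
    model_names=["t5-small", "t5-base"],
    base_classifier=RandomForestClassifier(n_estimators=100, random_state=42),
    tolerance=0.01,
)
meta.fit(texts, references)                  # label each text with its best model, then fit the classifier
summary, elapsed = meta.summarize(texts[0])  # route new text to the predicted best model
save_object(meta, "meta_model.pkl")          # persist with the pickle helper added in this commit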
requirements.txt CHANGED
@@ -6,5 +6,6 @@ numpy
  scipy
  rouge_score
  bert_score
- ipywidgets
- scikit-learn
+ scikit-learn
+ nltk
+ textstat
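Setup note (an assumption, not part of this diff): model.py also loads spaCy's en_core_web_sm pipeline at import time, which installing the packages above does not provide on its own. One way to fetch it, assuming spacy itself is pinned in the unchanged lines of requirements.txt, is:

# Hypothetical one-time setup step: download the spaCy English pipeline
# that extract_features() loads at import time.
import spacy.cli

spacy.cli.download("en_core_web_sm")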