tlemagueresse committed on
Commit c53eedd · 1 Parent(s): d75519d

First model in WIP

Files changed (4)
  1. README.md +0 -12
  2. demo.ipynb +0 -0
  3. model.py +226 -51
  4. requirements.txt +3 -2
README.md CHANGED
@@ -1,12 +0,0 @@
- ---
- title: OptimAbstract
- emoji: ⚡
- colorFrom: indigo
- colorTo: pink
- sdk: gradio
- sdk_version: 5.16.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
demo.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
model.py CHANGED
@@ -1,38 +1,58 @@
+ import pickle
  import time
  from collections import Counter
+ from copy import deepcopy

+ import nltk
  import numpy as np
  import spacy
- import torch
+ from nltk.corpus import stopwords
+ from textstat import textstat
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
- from datasets import load_dataset
  from bert_score import score
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import StandardScaler
  from scipy.stats import entropy

- def compute_entropy(text):
-     words = text.split()
-     word_freq = Counter(words)
-     probs = np.array(list(word_freq.values())) / sum(word_freq.values())
-     return entropy(probs)

+ nltk.download("punkt")
+ nltk.download("averaged_perceptron_tagger")
+ nltk.download("stopwords")

- def compute_syntactic_complexity(text):
-     nlp = spacy.load("en_core_web_sm")
-     doc = nlp(text)
-     depths = [token.head.i - token.i for token in doc if token.head != token]
-     return np.mean(depths) if depths else 0
+ nlp = spacy.load("en_core_web_sm")


  class T5Model:
+     """
+     A class to encapsulate a T5 summarization model.
+
+     Parameters
+     ----------
+     model_name : str
+         The name of the pretrained T5 model.
+     """
+
      def __init__(self, model_name):
          self.model_name = model_name
          self.tokenizer = AutoTokenizer.from_pretrained(model_name)
          self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

      def summarize(self, text):
+         """
+         Generate a summary for the given text.
+
+         Tokenize -> generate the summary -> decode the text.
+
+         Parameters
+         ----------
+         text : str
+             The input text to summarize.
+
+         Returns
+         -------
+         summary : str
+             The generated summary.
+         elapsed_time : float
+             The time taken for summarization in seconds.
+         """
          inputs = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
          start_time = time.time()
          outputs = self.model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
@@ -42,50 +62,205 @@ class T5Model:


  class MetaModel:
-     def __init__(self, model_names):
-         self.models = {name: T5Model(name) for name in model_names}
-         self.classifier = RandomForestClassifier(n_estimators=100, random_state=42)
-
-     def extract_features(self, text):
-         words = text.split()
-         num_words = len(words)
-         avg_word_length = np.mean([len(w) for w in words]) if words else 0
-         complexity = compute_syntactic_complexity(text)
-         entropy = compute_entropy(text)
-         return [num_words, avg_word_length, complexity, entropy]
+     """
+     A meta model that selects the best T5Model based on extracted features and a base classifier.
+
+     Parameters
+     ----------
+     model_names : list of str
+         List of pretrained T5 model names.
+     base_classifier : object
+         A classifier instance used to predict the best model.
+     tolerance : float, optional
+         Tolerance threshold for model selection (default is 0.01).
+     """
+
+     def __init__(self, model_names, base_classifier, tolerance=0.01):
+         self.models = {name: T5Model(name) for name in model_names}
+         self.base_classifier = deepcopy(base_classifier)
+         self.tolerance = tolerance

      def fit(self, texts, summaries):
-         X = np.array([self.extract_features(text) for text in texts])
-
-         best_model_labels = []
-         tolerance = 0.05  # BERTScore tolerance
-
-         for i, text in enumerate(texts):
-             model_results = []
-             for model_name, model in self.models.items():
-                 summary, elapsed_time = model.summarize(text)
-                 P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
-                 f1_score = F1.item()
-                 model_results.append((model_name, f1_score, elapsed_time))
-
-             # Sort models by BERTScore (desc) and then by time (asc)
-             model_results.sort(key=lambda x: (-x[1], x[2]))
-
-             # Select best model based on tolerance rule
-             best_model, best_score, best_time = model_results[0]
-             for model_name, f1_score, elapsed_time in model_results[1:]:
-                 if best_score - f1_score <= tolerance and elapsed_time < best_time:
-                     best_model, best_score, best_time = model_name, f1_score, elapsed_time
-
-             best_model_labels.append(best_model)
-
-         y = np.array([list(self.models.keys()).index(m) for m in best_model_labels])
-         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-         self.classifier.fit(X_train, y_train)
+         """
+         Fit the base classifier using extracted features and best model labels.
+
+         Parameters
+         ----------
+         texts : list of str
+             List of input texts.
+         summaries : list of str
+             List of reference summaries.
+         """
+         X = np.array([list(extract_features(text).values()) for text in texts])
+         y = get_best_model(self.models, texts, summaries, self.tolerance)
+         self.base_classifier.fit(X, y)

-     def summarize(self, text):
-         features = np.array([self.extract_features(text)])
-         predicted_model_index = self.classifier.predict(features)[0]
-         predicted_model_name = list(self.models.keys())[predicted_model_index]
-         return self.models[predicted_model_name].summarize(text)
+     def summarize(self, text):
+         """
+         Summarize text using the predicted best model.
+
+         Parameters
+         ----------
+         text : str
+             The input text to summarize.
+
+         Returns
+         -------
+         summary : str
+             The generated summary.
+         elapsed_time : float
+             The time taken for summarization in seconds.
+         """
+         features = np.array(list(extract_features(text).values()))[np.newaxis, :]
+         predicted_model_index = self.base_classifier.predict(features)[0]
+         predicted_model_name = list(self.models.keys())[predicted_model_index]
+         return self.models[predicted_model_name].summarize(text)
+
+
+ def save_object(obj, filename):
+     with open(filename, "wb") as f:
+         pickle.dump(obj, f)
+
+
+ def load_object(filename):
+     with open(filename, "rb") as f:
+         return pickle.load(f)
+
+
+ def get_best_model(models, texts, summaries, tolerance):
+     """
+     Determine the best model for each text based on BERTScore and summarization time.
+
+     Parameters
+     ----------
+     models : dict
+         Dictionary mapping model names to T5Model instances.
+     texts : list of str
+         List of input texts.
+     summaries : list of str
+         List of reference summaries.
+     tolerance : float
+         Tolerance threshold for model selection.
+
+     Returns
+     -------
+     y : np.ndarray
+         Array of indices corresponding to the best model for each text.
+     """
+     best_model_labels = []
+
+     for i, text in enumerate(texts):
+         model_results = []
+         for model_name, model in models.items():
+             summary, elapsed_time = model.summarize(text)
+             P, R, F1 = score([summary], [summaries[i]], lang="en", verbose=False)
+             f1_score = F1.item()
+             model_results.append((model_name, f1_score, elapsed_time))
+
+         # Sort models by BERTScore (desc) and then by time (asc)
+         model_results.sort(key=lambda x: (-x[1], x[2]))
+
+         # Select best model based on tolerance rule
+         best_model, best_score, best_time = model_results[0]
+         for model_name, f1_score, elapsed_time in model_results[1:]:
+             if best_score - f1_score <= tolerance and elapsed_time < best_time:
+                 best_model, best_score, best_time = model_name, f1_score, elapsed_time
+
+         best_model_labels.append(best_model)
+
+     y = np.array([list(models.keys()).index(m) for m in best_model_labels])
+
+     return y
+
+
+ def extract_features(text):
+     """
+     Extract linguistic and statistical features from a text.
+
+     Parameters
+     ----------
+     text : str
+         The input text.
+
+     Returns
+     -------
+     features : dict
+         Dictionary of extracted features:
+         - num_words : int
+         - avg_word_length : float
+         - num_sentences : int
+         - avg_sentence_length : float
+         - avg_syntax_depth : float
+         - num_subordinates : int
+         - num_verbs : int
+         - num_passive : int
+         - type_token_ratio : float
+         - lexical_entropy : float
+         - syllables_per_word : float
+         - complex_words : int
+         - stopword_ratio : float
+     """
+     doc = nlp(text)
+
+     num_words = len(doc)
+     avg_word_length = (
+         np.mean([len(token.text) for token in doc if token.is_alpha]) if num_words > 0 else 0
+     )
+
+     sentences = list(doc.sents)
+     num_sentences = len(sentences)
+     avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
+
+     # Syntactic depth
+     depths = [token.head.i - token.i for token in doc if token.head != token]
+     avg_syntax_depth = np.mean(depths) if depths else 0
+
+     subordinate_conjunctions = {
+         "because",
+         "although",
+         "since",
+         "unless",
+         "whereas",
+         "while",
+         "though",
+         "if",
+     }
+     num_subordinates = sum(1 for token in doc if token.text.lower() in subordinate_conjunctions)
+
+     num_verbs = sum(1 for token in doc if token.pos_ == "VERB")
+     num_passive = sum(1 for token in doc if token.dep_ == "auxpass")
+
+     words = [token.text.lower() for token in doc if token.is_alpha]
+     unique_words = set(words)
+     type_token_ratio = len(unique_words) / len(words) if len(words) > 0 else 0
+
+     word_freqs = Counter(words)
+     word_probs = np.array(list(word_freqs.values())) / num_words if num_words > 0 else [1]
+     lexical_entropy = entropy(word_probs)
+
+     syllables_per_word = (
+         np.mean([textstat.syllable_count(token.text) for token in doc if token.is_alpha])
+         if num_words > 0
+         else 0
+     )
+     complex_words = sum(1 for token in doc if textstat.syllable_count(token.text) >= 3)
+
+     stop_words = set(stopwords.words("english"))
+     stopword_ratio = (
+         sum(1 for word in words if word in stop_words) / num_words if num_words > 0 else 0
+     )
+
+     return {
+         "num_words": num_words,
+         "avg_word_length": avg_word_length,
+         "num_sentences": num_sentences,
+         "avg_sentence_length": avg_sentence_length,
+         "avg_syntax_depth": avg_syntax_depth,
+         "num_subordinates": num_subordinates,
+         "num_verbs": num_verbs,
+         "num_passive": num_passive,
+         "type_token_ratio": type_token_ratio,
+         "lexical_entropy": lexical_entropy,
+         "syllables_per_word": syllables_per_word,
+         "complex_words": complex_words,
+         "stopword_ratio": stopword_ratio,
+     }
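For context, a minimal usage sketch of the MetaModel defined above; the checkpoint names, the RandomForestClassifier choice, and the example texts are illustrative assumptions, not part of this commit:

# Hypothetical usage sketch: assumes this file is importable as `model`,
# that scikit-learn is installed, and that t5-small / t5-base are acceptable
# checkpoints. Running it downloads the T5 and BERTScore models.
from sklearn.ensemble import RandomForestClassifier

from model import MetaModel, save_object

texts = [
    "The Eiffel Tower was completed in 1889 and remains one of the most visited monuments in the world.",
    "Photosynthesis converts light energy into chemical energy stored in glucose.",
]
references = [
    "The Eiffel Tower, finished in 1889, is a highly visited monument.",
    "Photosynthesis turns light into chemical energy as glucose.",
]

meta = MetaModel(
    model_names=["t5-small", "t5-base"],
    base_classifier=RandomForestClassifier(n_estimators=100, random_state=42),
    tolerance=0.01,
)
meta.fit(texts, references)                  # label each text with its best model, then fit the classifier
summary, elapsed = meta.summarize(texts[0])  # route new text to the predicted best model
save_object(meta, "meta_model.pkl")          # persist with the pickle helper added in this commit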
requirements.txt CHANGED
@@ -6,5 +6,6 @@ numpy
  scipy
  rouge_score
  bert_score
- ipywidgets
- scikit-learn
+ scikit-learn
+ nltk
+ textstat
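Setup note (an assumption, not part of this diff): model.py also loads spaCy's en_core_web_sm pipeline at import time, which installing the packages above does not provide on its own. One way to fetch it, assuming spacy itself is pinned in the unchanged lines of requirements.txt, is:

# Hypothetical one-time setup step: download the spaCy English pipeline
# that extract_features() loads at import time.
import spacy.cli

spacy.cli.download("en_core_web_sm")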