piecurus committed on
Commit
7215c40
1 Parent(s): a4331bf

extractive

Browse files
extractive_summarizer/bert_parent.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import torch
4
+ import streamlit as st
5
+ import numpy as np
6
+ from numpy import ndarray
7
+ from transformers import (AlbertModel, AlbertTokenizer, BertModel,
8
+ BertTokenizer, DistilBertModel, DistilBertTokenizer,
9
+ PreTrainedModel, PreTrainedTokenizer, XLMModel,
10
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
11
+
12
@st.cache()
def load_hf_model(base_model, model_name, device):
    """
    Load pretrained weights through ``base_model`` and move them to ``device``.

    The function is decorated with ``st.cache()``.

    :param base_model: Model class exposing ``from_pretrained``.
    :param model_name: Name or path handed to ``from_pretrained``.
    :param device: Device the loaded model is moved to via ``.to``.
    :return: The loaded model, created with ``output_hidden_states=True``.
    """
    pretrained = base_model.from_pretrained(model_name, output_hidden_states=True)
    return pretrained.to(device)
16
+
17
class BertParent(object):
    """
    Base handler for BERT models.

    Wraps a transformer model and its tokenizer and turns sentences into
    fixed-size embedding vectors read from the model's hidden states.
    """

    # Supported keyword -> (model class, tokenizer class).
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
        'albert-base-v1': (AlbertModel, AlbertTokenizer),
        'albert-large-v1': (AlbertModel, AlbertTokenizer)
    }

    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        gpu_id: int = 0,
    ):
        """
        :param model: Key into ``MODELS``; it is also the name passed to ``from_pretrained``.
            May be anything (e.g. None) when both a custom model and a custom tokenizer are given.
        :param custom_model: This is optional if a custom bert model is used.
        :param custom_tokenizer: Place to use custom tokenizer.
        :param gpu_id: Index of the CUDA device to use; only consulted when CUDA is available.
        :raises ValueError: If ``model`` is not a key of ``MODELS`` and the matching
            custom model / custom tokenizer was not supplied.
        """
        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        self.device = torch.device("cpu")
        if torch.cuda.is_available():
            assert (
                isinstance(gpu_id, int) and (0 <= gpu_id and gpu_id < torch.cuda.device_count())
            ), f"`gpu_id` must be an integer between 0 to {torch.cuda.device_count() - 1}. But got: {gpu_id}"

            self.device = torch.device(f"cuda:{gpu_id}")

        if custom_model is not None:
            self.model = custom_model.to(self.device)
        else:
            if base_model is None:
                # Without this guard the failure is an AttributeError on None.
                raise ValueError(
                    f"Unknown model {model!r}: expected one of {sorted(self.MODELS)} "
                    "or a `custom_model`."
                )
            self.model = load_hf_model(base_model, model, self.device)

        if custom_tokenizer is not None:
            self.tokenizer = custom_tokenizer
        else:
            if base_tokenizer is None:
                raise ValueError(
                    f"Unknown model {model!r}: expected one of {sorted(self.MODELS)} "
                    "or a `custom_tokenizer`."
                )
            self.tokenizer = base_tokenizer.from_pretrained(model)

        self.model.eval()

    def tokenize_input(self, text: str) -> torch.Tensor:
        """
        Tokenizes the text input.

        :param text: Text to tokenize.
        :return: A tensor holding one row of token ids, placed on ``self.device``.
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        return torch.tensor([indexed_tokens]).to(self.device)

    def _pooled_handler(self, hidden: torch.Tensor,
                        reduce_option: str) -> torch.Tensor:
        """
        Reduces a hidden-state tensor along dimension 1 and squeezes the result.

        :param hidden: The hidden torch tensor to process.
        :param reduce_option: 'max' or 'median'; any other value falls back to the mean.
        :return: Returns a torch tensor.
        """
        if reduce_option == 'max':
            return hidden.max(dim=1)[0].squeeze()

        if reduce_option == 'median':
            return hidden.median(dim=1)[0].squeeze()

        return hidden.mean(dim=1).squeeze()

    def extract_embeddings(
        self,
        text: str,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> torch.Tensor:
        """
        Extracts the embeddings for the given text.

        :param text: The text to extract embeddings for.
        :param hidden: The hidden layer(s) to use for a readout handler.
        :param reduce_option: How we should reduce the items. The deprecated keywords
            'concat_last_4' and 'reduce_last_4' ignore ``hidden`` and use the last four layers.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A torch vector.
        """
        tokens_tensor = self.tokenize_input(text)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            _pooled, hidden_states = self.model(tokens_tensor)[-2:]

        # deprecated temporary keyword functions.
        if reduce_option == 'concat_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            cat_hidden_states = torch.cat(tuple(last_4), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        if reduce_option == 'reduce_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            return torch.cat(tuple(last_4), dim=1).mean(dim=1).squeeze()

        if isinstance(hidden, int):
            return self._pooled_handler(hidden_states[hidden], reduce_option)

        last_states = [hidden_states[i] for i in hidden]

        if hidden_concat:
            # Join the selected layers on the last dimension, then average over dimension 1.
            cat_hidden_states = torch.cat(tuple(last_states), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        # Otherwise stack the selected layers along dimension 1 and reduce them together.
        hidden_s = torch.cat(tuple(last_states), dim=1)
        return self._pooled_handler(hidden_s, reduce_option)

    def create_matrix(
        self,
        content: List[str],
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings.

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content, one entry per sentence.
        """
        return np.asarray([
            np.squeeze(self.extract_embeddings(
                t, hidden=hidden, reduce_option=reduce_option, hidden_concat=hidden_concat
            ).detach().cpu().numpy()) for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: int = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings (delegates to ``create_matrix``).

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content.
        """
        return self.create_matrix(content, hidden, reduce_option, hidden_concat)
extractive_summarizer/cluster_features.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+ from numpy import ndarray
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.decomposition import PCA
7
+ from sklearn.mixture import GaussianMixture
8
+
9
+
10
class ClusterFeatures(object):
    """
    Basic handling of clustering features.
    """

    def __init__(
        self,
        features: ndarray,
        algorithm: str = 'kmeans',
        pca_k: int = None,
        random_state: int = 12345,
    ):
        """
        :param features: the embedding matrix created by bert parent.
        :param algorithm: Which clustering algorithm to use ('gmm', anything else means k-means).
        :param pca_k: If you want the features to be ran through pca, this is the components number.
        :param random_state: Random state.
        """
        if pca_k:
            self.features = PCA(n_components=pca_k).fit_transform(features)
        else:
            self.features = features

        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def __get_model(self, k: int):
        """
        Retrieve clustering model.

        :param k: amount of clusters.
        :return: Clustering model (unfitted).
        """
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)

    def __get_centroids(self, model):
        """
        Retrieve centroids of model.

        :param model: Fitted clustering model.
        :return: Centroids (``means_`` for 'gmm', ``cluster_centers_`` otherwise).
        """
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def __find_closest_args(self, centroids: np.ndarray) -> Dict:
        """
        Find the closest arguments to centroid.

        Each feature row is assigned to at most one centroid: centroids are
        handled in order and a row that was already picked is skipped.

        :param centroids: Centroids to find closest.
        :return: Mapping of centroid index to the index of its closest unused feature row
            (-1 when no row with a distance below 1e10 is left).
        """
        args = {}
        used_idx = set()

        for j, centroid in enumerate(centroids):
            centroid_min = 1e10
            cur_arg = -1

            for i, feature in enumerate(self.features):
                if i in used_idx:
                    continue

                value = np.linalg.norm(feature - centroid)

                # Strict comparison keeps the first row on ties.
                if value < centroid_min:
                    cur_arg = i
                    centroid_min = value

            used_idx.add(cur_arg)
            args[j] = cur_arg

        return args

    def calculate_elbow(self, k_max: int) -> List[float]:
        """
        Calculates elbow up to the provided k_max.

        :param k_max: K_max to calculate elbow for (exclusive upper bound on k).
        :return: The inertias for k = 1 .. min(k_max, number of features) - 1.
        """
        inertias = []

        for k in range(1, min(k_max, len(self.features))):
            model = self.__get_model(k).fit(self.features)

            # NOTE(review): reads ``inertia_``; presumably only the k-means model
            # provides it, so algorithm='gmm' likely fails here - confirm.
            inertias.append(model.inertia_)

        return inertias

    def calculate_optimal_cluster(self, k_max: int):
        """
        Calculates the optimal cluster based on Elbow.

        :param k_max: The max k to search elbow for.
        :return: The optimal cluster size (1 when no positive elbow strength is found).
        """
        delta_1 = []
        delta_2 = []

        max_strength = 0
        k = 1

        inertias = self.calculate_elbow(k_max)

        # First and second order differences of the inertia curve.
        for i in range(len(inertias)):
            delta_1.append(inertias[i] - inertias[i - 1] if i > 0 else 0.0)
            delta_2.append(delta_1[i] - delta_1[i - 1] if i > 1 else 0.0)

        for j in range(len(inertias)):
            # The first two and the last position have no usable neighbours.
            strength = 0 if j <= 1 or j == len(inertias) - 1 else delta_2[j + 1] - delta_1[j + 1]

            if strength > max_strength:
                max_strength = strength
                k = j + 1

        return k

    def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sorted sentence indexes that qualify for summary.
        """
        if num_sentences is not None:
            if num_sentences == 0:
                return []

            k = min(num_sentences, len(self.features))
        else:
            k = max(int(len(self.features) * ratio), 1)

        model = self.__get_model(k).fit(self.features)

        centroids = self.__get_centroids(model)
        cluster_args = self.__find_closest_args(centroids)

        return sorted(cluster_args.values())

    def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio (delegates to ``cluster``).

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary.
        """
        # Forward num_sentences as well so it is not silently ignored.
        return self.cluster(ratio, num_sentences)
extractive_summarizer/model_processors.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ from transformers import (AlbertModel, AlbertTokenizer, BartModel,
5
+ BartTokenizer, BertModel, BertTokenizer,
6
+ CamembertModel, CamembertTokenizer, CTRLModel,
7
+ CTRLTokenizer, DistilBertModel, DistilBertTokenizer,
8
+ GPT2Model, GPT2Tokenizer, LongformerModel,
9
+ LongformerTokenizer, OpenAIGPTModel,
10
+ OpenAIGPTTokenizer, PreTrainedModel,
11
+ PreTrainedTokenizer, RobertaModel, RobertaTokenizer,
12
+ TransfoXLModel, TransfoXLTokenizer, XLMModel,
13
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
14
+
15
+ from extractive_summarizer.bert_parent import BertParent
16
+ from extractive_summarizer.cluster_features import ClusterFeatures
17
+ from extractive_summarizer.sentence_handler import SentenceHandler
18
+
19
+
20
class ModelProcessor(object):
    # Maps the ``aggregate`` option of ``run_embeddings`` to the numpy reducer it applies.
    aggregate_map = {
        'mean': np.mean,
        'min': np.min,
        'median': np.median,
        'max': np.max,
    }

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the parent Bert Summarizer model. New methods should implement this class.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler to process sentences. If want to use coreference, instantiate and pass.
            CoreferenceHandler instance
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        # Seeds numpy's global random generator.
        np.random.seed(random_state)
        self.model = BertParent(model, custom_model, custom_tokenizer, gpu_id)
        self.hidden = hidden
        self.reduce_option = reduce_option
        self.sentence_handler = sentence_handler
        self.random_state = random_state
        self.hidden_concat = hidden_concat

    def cluster_runner(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> Tuple[List[str], np.ndarray]:
        """
        Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

        When ``use_first`` is set and clustering did not select index 0, index 0 is
        inserted afterwards, so the result can hold one sentence more than the
        clustering step returned.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Return the first sentence in the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization.
        :return: A tuple of summarized sentences and embeddings
        """
        hidden = self.model(
            content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
        hidden_args = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

        if use_first:
            if not hidden_args:
                hidden_args.append(0)
            elif hidden_args[0] != 0:
                hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]
        embeddings = np.asarray([hidden[j] for j in hidden_args])

        return sentences, embeddings

    def __run_clusters(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> List[str]:
        """
        Runs clusters and returns sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: summarized sentences
        """
        sentences, _ = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return sentences

    def __retrieve_summarized_embeddings(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> np.ndarray:
        """
        Retrieves embeddings of the summarized sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Summarized embeddings
        """
        _, embeddings = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return embeddings

    def calculate_elbow(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ) -> List[float]:
        """
        Calculates elbow across the clusters.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to the number of sentences minus one.
        :return: List of elbow inertia values.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        hidden = self.model(sentences, self.hidden,
                            self.reduce_option, hidden_concat=self.hidden_concat)
        elbow = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_elbow(k_max)

        return elbow

    def calculate_optimal_k(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ):
        """
        Calculates the optimal Elbow K.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to the number of sentences minus one.
        :return: The value returned by ``ClusterFeatures.calculate_optimal_cluster``.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        hidden = self.model(sentences, self.hidden,
                            self.reduce_option, hidden_concat=self.hidden_concat)
        optimal_k = ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_optimal_cluster(k_max)

        return optimal_k

    def run_embeddings(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        aggregate: str = None,
    ) -> Optional[np.ndarray]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the embeddings.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use. Overrides ratio.
        :param aggregate: One of mean, median, max, min. Applied on zero axis
        :return: A summary embedding, or None when the sentence handler yields no sentences.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if not sentences:
            return None

        embeddings = self.__retrieve_summarized_embeddings(
            sentences, ratio, algorithm, use_first, num_sentences)

        if aggregate is not None:
            # aggregate_map is the single source of truth for the accepted names.
            assert aggregate in self.aggregate_map, "aggregate must be mean, min, max, or median"
            embeddings = self.aggregate_map[aggregate](embeddings, axis=0)

        return embeddings

    def run(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False
    ) -> Union[List, str]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: The summary sentences as a list, or joined with single spaces into one string.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        # Clustering is skipped when no sentence survived the length filter.
        if sentences:
            sentences = self.__run_clusters(
                sentences, ratio, algorithm, use_first, num_sentences)

        if return_as_list:
            return sentences
        return ' '.join(sentences)

    def __call__(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False,
    ) -> Union[List, str]:
        """
        (utility that wraps around the run function)
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process.
        :param ratio: Ratio of sentences to use.
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary.
        :param use_first: Whether or not to use the first sentence.
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: A summary sentence, or a list of sentences when ``return_as_list`` is set.
        """
        return self.run(
            body, ratio, min_length, max_length, algorithm=algorithm, use_first=use_first, num_sentences=num_sentences,
            return_as_list=return_as_list
        )
307
+
308
+
309
class Summarizer(ModelProcessor):

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the main Bert Summarizer class.

        Every argument is handed unchanged to the ``ModelProcessor`` constructor.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler used to split the raw text into sentences.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        super().__init__(
            model=model,
            custom_model=custom_model,
            custom_tokenizer=custom_tokenizer,
            hidden=hidden,
            reduce_option=reduce_option,
            sentence_handler=sentence_handler,
            random_state=random_state,
            hidden_concat=hidden_concat,
            gpu_id=gpu_id,
        )
341
+
342
+
343
class TransformerSummarizer(ModelProcessor):
    """
    Another type of Summarizer class to choose keyword based model and tokenizer
    """

    # transformer_type keyword -> (model class, tokenizer class).
    # All classes are imported at module level, so the table is complete at
    # class-definition time and is not modified per instance.
    MODEL_DICT = {
        'Bert': (BertModel, BertTokenizer),
        'OpenAIGPT': (OpenAIGPTModel, OpenAIGPTTokenizer),
        'GPT2': (GPT2Model, GPT2Tokenizer),
        'CTRL': (CTRLModel, CTRLTokenizer),
        'TransfoXL': (TransfoXLModel, TransfoXLTokenizer),
        'XLNet': (XLNetModel, XLNetTokenizer),
        'XLM': (XLMModel, XLMTokenizer),
        'DistilBert': (DistilBertModel, DistilBertTokenizer),
        'Roberta': (RobertaModel, RobertaTokenizer),
        'Albert': (AlbertModel, AlbertTokenizer),
        'Camembert': (CamembertModel, CamembertTokenizer),
        'Bart': (BartModel, BartTokenizer),
        'Longformer': (LongformerModel, LongformerTokenizer),
    }

    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: SentenceHandler = SentenceHandler(),
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc.
            Must be a key of ``MODEL_DICT``.
        :param transformer_model_key: The transformer model key. This is the directory for the model.
        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
            Falls back to ``transformer_model_key`` when None.
        :param hidden: The hidden output layers to use for the summarization.
        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
        :param sentence_handler: The sentence handler class to process the raw text.
        :param random_state: The random state to use.
        :param hidden_concat: Deprecated hidden concat option.
        :param gpu_id: GPU device index if CUDA is available.
        :raises KeyError: If ``transformer_type`` is not a key of ``MODEL_DICT``.
        """
        if transformer_type not in self.MODEL_DICT:
            # Still a KeyError, as a plain dict lookup would raise, but with the valid choices listed.
            raise KeyError(
                f"Unknown transformer_type {transformer_type!r}; "
                f"expected one of {sorted(self.MODEL_DICT)}"
            )

        model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
        model = model_clz.from_pretrained(
            transformer_model_key, output_hidden_states=True)

        tokenizer = tokenizer_clz.from_pretrained(
            transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key
        )

        # The model name is None because an already-built model and tokenizer are supplied.
        super().__init__(
            None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
extractive_summarizer/sentence_handler.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from spacy.lang.en import English
4
+
5
+
6
class SentenceHandler(object):

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        Builds the pipeline from ``language`` and registers a sentencizer,
        trying the spacy 2.0 call first and the spacy 3.0 call on failure.

        :param language: Determines the language to use with spacy.
        """
        self.nlp = language()

        try:
            # Supports spacy 2.0
            component = self.nlp.create_pipe('sentencizer')
            self.nlp.add_pipe(component)
        except Exception:
            # Supports spacy 3.0
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True
        else:
            self.is_spacy_3 = False

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Turns a spacy document into a list of stripped sentence strings.

        Only sentences whose stripped length lies strictly between
        ``min_length`` and ``max_length`` are kept.

        :param doc: The document to use from spacy.
        :param min_length: The minimum length a sentence should be to be considered.
        :param max_length: The maximum length a sentence should be to be considered.
        :return: Sentences.
        """
        kept = []

        for span in doc.sents:
            stripped = span.text.strip()

            if not (min_length < len(stripped) < max_length):
                continue

            # The spacy 2.0 branch reads the sentence through ``.string``.
            kept.append(stripped if self.is_spacy_3 else span.string.strip())

        return kept

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Runs the spacy pipeline on the body and extracts its sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.sentence_processor(self.nlp(body), min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Shortcut for ``process``.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)