Gladiator committed on
Commit
b04763d
·
1 Parent(s): 02df788

add summarizer code

Browse files
.gitignore ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # local stuff
141
+ Docs/
src/vanilla_summarizer.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import streamlit as st
3
+ from transformers import BartTokenizer, BartForConditionalGeneration
4
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
5
+
6
# Page header.
st.title('Text Summarization Demo')
st.markdown('Using BART and T5 transformer model')

# Which summarizer to run; ``run_model`` reads this module-level selection.
model = st.selectbox('Select the model', ('BART', 'T5'))

# Per-model defaults for the generation settings exposed below.
if model == 'BART':
    _num_beams = 4
    _no_repeat_ngram_size = 3
    _length_penalty = 1.0
    _min_length = 12
    _max_length = 128
    _early_stopping = True
else:
    _num_beams = 4
    _no_repeat_ngram_size = 3
    _length_penalty = 2.0
    _min_length = 30
    _max_length = 200
    _early_stopping = True

# ``st.beta_columns`` was the pre-release name of ``st.columns`` and is gone
# from current Streamlit releases; prefer the stable name and only fall back
# to the beta one on old installations.
_make_columns = getattr(st, 'columns', None) or st.beta_columns

col1, col2, col3 = _make_columns(3)
_num_beams = col1.number_input("num_beams", value=_num_beams)
_no_repeat_ngram_size = col2.number_input("no_repeat_ngram_size", value=_no_repeat_ngram_size)
# A float default lets the user enter fractional penalties (e.g. 1.5).
_length_penalty = col3.number_input("length_penalty", value=_length_penalty)

col1, col2, col3 = _make_columns(3)
_min_length = col1.number_input("min_length", value=_min_length)
_max_length = col2.number_input("max_length", value=_max_length)
# ``early_stopping`` is an on/off flag, so a checkbox is the matching widget
# (a number input would hand an integer to ``generate``).
_early_stopping = col3.checkbox("early_stopping", value=_early_stopping)

text = st.text_area('Text Input')
37
+
38
+
39
def run_model(input_text):
    """Summarize ``input_text`` with the selected model and render the result.

    Reads the module-level ``model`` selection and the ``_num_beams``,
    ``_no_repeat_ngram_size``, ``_length_penalty``, ``_min_length``,
    ``_max_length`` and ``_early_stopping`` settings, generates a summary and
    writes it to the Streamlit page.

    :param input_text: The raw text to summarize (converted with ``str``).
    :return: None. The summary is shown with ``st.success``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if model == "BART":
        checkpoint = "facebook/bart-base"
        model_cls, tokenizer_cls = BartForConditionalGeneration, BartTokenizer
    else:
        checkpoint = "t5-base"
        model_cls, tokenizer_cls = T5ForConditionalGeneration, T5Tokenizer

    # Collapse every run of whitespace (newlines included) into one space so
    # that words on adjacent lines are not glued together.
    cleaned = ' '.join(str(input_text).split())
    if not cleaned:
        st.warning('Please enter some text to summarize.')
        return

    # The weights must live on the same device as the input ids, otherwise
    # ``generate`` fails as soon as CUDA is available.
    summarizer = model_cls.from_pretrained(checkpoint).to(device)
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    # Truncate to the tokenizer's maximum length so that very long inputs do
    # not overflow the model.
    input_tokenized = tokenizer.encode(
        cleaned, return_tensors='pt', truncation=True).to(device)

    if model != "BART":
        # Token ids prepended for T5 to select the summarization task.
        summary_task = torch.tensor([[21603, 10]]).to(device)
        input_tokenized = torch.cat([summary_task, input_tokenized], dim=-1)

    summary_ids = summarizer.generate(input_tokenized,
                                      num_beams=_num_beams,
                                      no_repeat_ngram_size=_no_repeat_ngram_size,
                                      length_penalty=_length_penalty,
                                      min_length=_min_length,
                                      max_length=_max_length,
                                      early_stopping=_early_stopping)

    output = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for g in summary_ids
    ]
    st.write('Summary')
    st.success(output[0])
80
+
81
+
82
# Summarize the current text only when the Submit button has been pressed.
submitted = st.button('Submit')
if submitted:
    run_model(text)
summarizer/bert_parent.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ from numpy import ndarray
6
+ from transformers import (AlbertModel, AlbertTokenizer, BertModel,
7
+ BertTokenizer, DistilBertModel, DistilBertTokenizer,
8
+ PreTrainedModel, PreTrainedTokenizer, XLMModel,
9
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
10
+
11
+
12
class BertParent(object):
    """
    Base handler for BERT models.

    Wraps a transformer model together with its tokenizer and turns sentences
    into embedding vectors built from the model's hidden states.
    """

    # Keyword -> (model class, tokenizer class) used when no custom model or
    # tokenizer is supplied.
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
        'albert-base-v1': (AlbertModel, AlbertTokenizer),
        'albert-large-v1': (AlbertModel, AlbertTokenizer)
    }

    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        gpu_id: int = 0,
    ):
        """
        :param model: Keyword of the pretrained weights to load; must be a key of ``MODELS`` unless both a custom
            model and a custom tokenizer are given.
        :param custom_model: This is optional if a custom bert model is used.
        :param custom_tokenizer: Place to use custom tokenizer.
        :param gpu_id: GPU device index, only used when CUDA is available.
        :raises ValueError: If ``model`` is not a known keyword and a custom model or tokenizer is missing.
        """
        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        # Fail early with a readable message instead of an AttributeError on None further down.
        if base_model is None and (custom_model is None or custom_tokenizer is None):
            raise ValueError(
                f"Unknown model {model!r}: choose one of {sorted(self.MODELS)} "
                "or pass both `custom_model` and `custom_tokenizer`."
            )

        self.device = torch.device("cpu")
        if torch.cuda.is_available():
            assert (
                isinstance(gpu_id, int) and (0 <= gpu_id and gpu_id < torch.cuda.device_count())
            ), f"`gpu_id` must be an integer between 0 to {torch.cuda.device_count() - 1}. But got: {gpu_id}"

            self.device = torch.device(f"cuda:{gpu_id}")

        if custom_model is not None:
            self.model = custom_model.to(self.device)
        else:
            self.model = base_model.from_pretrained(
                model, output_hidden_states=True).to(self.device)

        if custom_tokenizer is not None:
            self.tokenizer = custom_tokenizer
        else:
            self.tokenizer = base_tokenizer.from_pretrained(model)

        self.model.eval()

    def tokenize_input(self, text: str) -> torch.Tensor:
        """
        Tokenizes the text input.

        :param text: Text to tokenize.
        :return: Returns a torch tensor holding one row of token ids, placed on ``self.device``.
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        return torch.tensor([indexed_tokens]).to(self.device)

    def _pooled_handler(self, hidden: torch.Tensor,
                        reduce_option: str) -> torch.Tensor:
        """
        Reduces a hidden-state tensor along dimension 1.

        :param hidden: The hidden torch tensor to process.
        :param reduce_option: 'max' or 'median'; any other value falls back to the mean.
        :return: Returns the reduced, squeezed torch tensor.
        """
        if reduce_option == 'max':
            return hidden.max(dim=1)[0].squeeze()

        if reduce_option == 'median':
            return hidden.median(dim=1)[0].squeeze()

        return hidden.mean(dim=1).squeeze()

    def extract_embeddings(
        self,
        text: str,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> torch.Tensor:
        """
        Extracts the embeddings for the given text.

        :param text: The text to extract embeddings for.
        :param hidden: The hidden layer(s) to use for a readout handler.
        :param reduce_option: How we should reduce the items.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A torch vector.
        """
        tokens_tensor = self.tokenize_input(text)

        # Inference only: skip autograd bookkeeping so no graph is kept alive per sentence.
        with torch.no_grad():
            _, hidden_states = self.model(tokens_tensor)[-2:]

        # deprecated temporary keyword functions.
        if reduce_option == 'concat_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            cat_hidden_states = torch.cat(tuple(last_4), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        if reduce_option == 'reduce_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            return torch.cat(tuple(last_4), dim=1).mean(dim=1).squeeze()

        if isinstance(hidden, int):
            return self._pooled_handler(hidden_states[hidden], reduce_option)

        last_states = [hidden_states[i] for i in hidden]

        if hidden_concat:
            # Concatenate the selected layers along the last dimension, then average over dimension 1.
            cat_hidden_states = torch.cat(tuple(last_states), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        # Otherwise stack the selected layers along dimension 1 and reduce them together.
        hidden_s = torch.cat(tuple(last_states), dim=1)

        return self._pooled_handler(hidden_s, reduce_option)

    def create_matrix(
        self,
        content: List[str],
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings.

        :param content: The list of sentences.
        :param hidden: Which hidden layer(s) to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content, one row per sentence.
        """
        return np.asarray([
            np.squeeze(self.extract_embeddings(
                t, hidden=hidden, reduce_option=reduce_option, hidden_concat=hidden_concat
            ).data.cpu().numpy()) for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings (shortcut for ``create_matrix``).

        :param content: The list of sentences.
        :param hidden: Which hidden layer(s) to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content.
        """
        return self.create_matrix(content, hidden, reduce_option, hidden_concat)
summarizer/cluster_features.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+ from numpy import ndarray
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.decomposition import PCA
7
+ from sklearn.mixture import GaussianMixture
8
+
9
+
10
class ClusterFeatures(object):
    """
    Basic handling of clustering features.

    Fits a clustering model on sentence embeddings and picks, for every
    cluster centre, the closest embedding that has not been picked yet.
    """

    def __init__(
        self,
        features: ndarray,
        algorithm: str = 'kmeans',
        pca_k: int = None,
        random_state: int = 12345,
    ):
        """
        :param features: the embedding matrix created by bert parent.
        :param algorithm: Which clustering algorithm to use ('gmm' selects a Gaussian mixture, anything else KMeans).
        :param pca_k: If you want the features to be ran through pca, this is the components number.
        :param random_state: Random state.
        """
        if pca_k:
            self.features = PCA(n_components=pca_k).fit_transform(features)
        else:
            self.features = features

        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def __get_model(self, k: int):
        """
        Retrieve clustering model.

        :param k: amount of clusters.
        :return: Clustering model (not fitted yet).
        """
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)

    def __get_centroids(self, model):
        """
        Retrieve centroids of model.

        :param model: Fitted clustering model.
        :return: Centroids.
        """
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def __find_closest_args(self, centroids: np.ndarray) -> Dict:
        """
        Find the closest arguments to centroid.

        Each feature is assigned to at most one centroid; ties go to the lowest feature index.

        :param centroids: Centroids to find closest.
        :return: Mapping of centroid position to feature index (-1 when no unused feature is left).
        """
        args = {}
        used_idx = set()

        for j, centroid in enumerate(centroids):
            best_arg = -1
            best_distance = np.inf

            for i, feature in enumerate(self.features):
                if i in used_idx:
                    continue

                distance = np.linalg.norm(feature - centroid)
                if distance < best_distance:
                    best_arg = i
                    best_distance = distance

            used_idx.add(best_arg)
            args[j] = best_arg

        return args

    def calculate_elbow(self, k_max: int) -> List[float]:
        """
        Calculates elbow up to the provided k_max.

        Reads ``inertia_`` from every fitted model, so the selected algorithm must expose that attribute.

        :param k_max: K_max to calculate elbow for (exclusive upper bound, capped at the number of features).
        :return: The inertias for k = 1 .. k_max - 1.
        """
        return [
            self.__get_model(k).fit(self.features).inertia_
            for k in range(1, min(k_max, len(self.features)))
        ]

    def calculate_optimal_cluster(self, k_max: int):
        """
        Calculates the optimal cluster based on Elbow.

        :param k_max: The max k to search elbow for.
        :return: The optimal cluster size (1 when no elbow is found).
        """
        delta_1 = []
        delta_2 = []

        max_strength = 0
        k = 1

        inertias = self.calculate_elbow(k_max)

        # First and second order differences of the inertia curve.
        for i in range(len(inertias)):
            delta_1.append(inertias[i] - inertias[i - 1] if i > 0 else 0.0)
            delta_2.append(delta_1[i] - delta_1[i - 1] if i > 1 else 0.0)

        for j in range(len(inertias)):
            # The first two and the last position have no usable differences.
            strength = 0 if j <= 1 or j == len(inertias) - 1 else delta_2[j + 1] - delta_1[j + 1]

            if strength > max_strength:
                max_strength = strength
                k = j + 1

        return k

    def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary, in ascending order.
        """
        if num_sentences is not None:
            if num_sentences == 0:
                return []

            k = min(num_sentences, len(self.features))
        else:
            k = max(int(len(self.features) * ratio), 1)

        model = self.__get_model(k).fit(self.features)

        centroids = self.__get_centroids(model)
        cluster_args = self.__find_closest_args(centroids)

        return sorted(cluster_args.values())

    def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio (shortcut for ``cluster``).

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary.
        """
        # Forward num_sentences as well; it used to be dropped silently here.
        return self.cluster(ratio, num_sentences)
summarizer/coreference_handler.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # removed previous import and related functionality since it's just a blank language model,
2
+ # while neuralcoref requires passing pretrained language model via spacy.load()
3
+
4
+ import neuralcoref
5
+ import spacy
6
+
7
+ from summarizer.sentence_handler import SentenceHandler
8
+
9
+
10
class CoreferenceHandler(SentenceHandler):
    """Sentence handler that resolves coreferences before splitting the text into sentences."""

    def __init__(self, spacy_model: str = 'en_core_web_sm',
                 greedyness: float = 0.45):
        """
        Coreference handler. Only works with spacy < 3.0.

        :param spacy_model: The spacy model to use as default.
        :param greedyness: The greedyness factor.
        """
        # A pretrained pipeline is loaded here instead of calling the base initialiser.
        self.nlp = spacy.load(spacy_model)
        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)

        # The inherited ``sentence_processor`` reads this flag, which the skipped base initialiser would have set.
        self.is_spacy_3 = False

    def process(self, body: str, min_length: int = 40, max_length: int = 600):
        """
        Processes the content sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be (exclusive)
        :param max_length: Max length that the sentences must fall under (exclusive)
        :return: Returns a list of sentences.
        """
        # First pass resolves the coreferences, second pass splits the resolved text into sentences.
        resolved = self.nlp(body)._.coref_resolved
        doc = self.nlp(resolved)

        stripped = (c.text.strip() for c in doc.sents)
        return [s for s in stripped if max_length > len(s) > min_length]
summarizer/model_processors.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ from transformers import (AlbertModel, AlbertTokenizer, BartModel,
5
+ BartTokenizer, BertModel, BertTokenizer,
6
+ CamembertModel, CamembertTokenizer, CTRLModel,
7
+ CTRLTokenizer, DistilBertModel, DistilBertTokenizer,
8
+ GPT2Model, GPT2Tokenizer, LongformerModel,
9
+ LongformerTokenizer, OpenAIGPTModel,
10
+ OpenAIGPTTokenizer, PreTrainedModel,
11
+ PreTrainedTokenizer, RobertaModel, RobertaTokenizer,
12
+ TransfoXLModel, TransfoXLTokenizer, XLMModel,
13
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
14
+
15
+ from summarizer.bert_parent import BertParent
16
+ from summarizer.cluster_features import ClusterFeatures
17
+ from summarizer.sentence_handler import SentenceHandler
18
+
19
+
20
class ModelProcessor(object):
    """
    Parent summarizer: embeds sentences, clusters the embeddings and returns the sentences closest to the
    cluster centres.
    """

    # Aggregations available to ``run_embeddings`` (applied on axis 0).
    aggregate_map = {
        'mean': np.mean,
        'min': np.min,
        'median': np.median,
        'max': np.max,
    }

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the parent Bert Summarizer model. New methods should implement this class.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler to process sentences. If want to use coreference, instantiate and pass
            a CoreferenceHandler instance. Defaults to a new SentenceHandler.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        np.random.seed(random_state)
        self.model = BertParent(model, custom_model, custom_tokenizer, gpu_id)
        self.hidden = hidden
        self.reduce_option = reduce_option
        # Build the default handler per instance rather than once at import time as a shared default argument.
        self.sentence_handler = sentence_handler if sentence_handler is not None else SentenceHandler()
        self.random_state = random_state
        self.hidden_concat = hidden_concat

    def _embed_body(self, body: str, min_length: int, max_length: int) -> Tuple[List[str], np.ndarray]:
        """Split ``body`` into sentences and return them together with their embeddings."""
        sentences = self.sentence_handler(body, min_length, max_length)
        hidden = self.model(
            sentences, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
        return sentences, hidden

    def cluster_runner(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> Tuple[List[str], np.ndarray]:
        """
        Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.

        :param content: Content list of sentences.
        :param ratio: The ratio to use for clustering.
        :param algorithm: Type of algorithm to use for clustering.
        :param use_first: Return the first sentence in the output (helpful for news stories, etc).
        :param num_sentences: Number of sentences to use for summarization. Overrides ratio. Together with
            ``use_first`` the first sentence takes one of these slots.
        :return: A tuple of summarized sentences and embeddings
        """
        hidden = self.model(
            content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)

        if use_first and num_sentences is not None:
            # The first sentence is always kept, so it takes one of the requested slots; only the remaining
            # slots are filled by clustering the other sentences. This keeps the result from growing to
            # num_sentences + 1 entries.
            remaining = num_sentences - 1
            hidden_args = [0]

            if remaining > 0 and len(content) > 1:
                picked = ClusterFeatures(
                    hidden[1:], algorithm, random_state=self.random_state).cluster(ratio, remaining)
                # Indices are relative to hidden[1:], shift them back to positions in ``content``.
                hidden_args.extend(i + 1 for i in picked)
        else:
            hidden_args = ClusterFeatures(
                hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)

            if use_first:
                if not hidden_args:
                    hidden_args.append(0)
                elif hidden_args[0] != 0:
                    hidden_args.insert(0, 0)

        sentences = [content[j] for j in hidden_args]
        embeddings = np.asarray([hidden[j] for j in hidden_args])

        return sentences, embeddings

    def __run_clusters(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> List[str]:
        """
        Runs clusters and returns sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: summarized sentences
        """
        sentences, _ = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return sentences

    def __retrieve_summarized_embeddings(
        self,
        content: List[str],
        ratio: float = 0.2,
        algorithm: str = 'kmeans',
        use_first: bool = True,
        num_sentences: int = None
    ) -> np.ndarray:
        """
        Retrieves embeddings of the summarized sentences.

        :param content: The content of sentences.
        :param ratio: Ratio to use for for clustering.
        :param algorithm: Algorithm selection for clustering.
        :param use_first: Whether to use first sentence
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Summarized embeddings
        """
        _, embeddings = self.cluster_runner(
            content, ratio, algorithm, use_first, num_sentences)
        return embeddings

    def calculate_elbow(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ) -> List[float]:
        """
        Calculates elbow across the clusters.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to the number of sentences minus one.
        :return: List of elbow inertia values.
        """
        sentences, hidden = self._embed_body(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        return ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_elbow(k_max)

    def calculate_optimal_k(
        self,
        body: str,
        algorithm: str = 'kmeans',
        min_length: int = 40,
        max_length: int = 600,
        k_max: int = None,
    ):
        """
        Calculates the optimal Elbow K.

        :param body: The input body to summarize.
        :param algorithm: The algorithm to use for clustering.
        :param min_length: The min length to use.
        :param max_length: The max length to use.
        :param k_max: The maximum number of clusters to search. Defaults to the number of sentences minus one.
        :return: The optimal number of clusters.
        """
        sentences, hidden = self._embed_body(body, min_length, max_length)

        if k_max is None:
            k_max = len(sentences) - 1

        return ClusterFeatures(
            hidden, algorithm, random_state=self.random_state).calculate_optimal_cluster(k_max)

    def run_embeddings(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        aggregate: str = None,
    ) -> Optional[np.ndarray]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the embeddings.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use. Overrides ratio.
        :param aggregate: One of mean, median, max, min. Applied on zero axis
        :return: A summary embedding, or None when no sentence qualifies.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if not sentences:
            return None

        embeddings = self.__retrieve_summarized_embeddings(
            sentences, ratio, algorithm, use_first, num_sentences)

        if aggregate is not None:
            assert aggregate in self.aggregate_map, "aggregate must be mean, min, max, or median"
            embeddings = self.aggregate_map[aggregate](embeddings, axis=0)

        return embeddings

    def run(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False
    ) -> Union[List, str]:
        """
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process
        :param ratio: Ratio of sentences to use
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary
        :param use_first: Whether or not to use the first sentence
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: The summary sentences, as a list or joined with single spaces.
        """
        sentences = self.sentence_handler(body, min_length, max_length)

        if sentences:
            sentences = self.__run_clusters(
                sentences, ratio, algorithm, use_first, num_sentences)

        if return_as_list:
            return sentences

        return ' '.join(sentences)

    def __call__(
        self,
        body: str,
        ratio: float = 0.2,
        min_length: int = 40,
        max_length: int = 600,
        use_first: bool = True,
        algorithm: str = 'kmeans',
        num_sentences: int = None,
        return_as_list: bool = False,
    ) -> Union[List, str]:
        """
        (utility that wraps around the run function)
        Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

        :param body: The raw string body to process.
        :param ratio: Ratio of sentences to use.
        :param min_length: Minimum length of sentence candidates to utilize for the summary.
        :param max_length: Maximum length of sentence candidates to utilize for the summary.
        :param use_first: Whether or not to use the first sentence.
        :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
        :param num_sentences: Number of sentences to use (overrides ratio).
        :param return_as_list: Whether or not to return sentences as list.
        :return: The summary sentences, as a list or joined with single spaces.
        """
        return self.run(
            body, ratio, min_length, max_length, algorithm=algorithm, use_first=use_first, num_sentences=num_sentences,
            return_as_list=return_as_list
        )
307
+
308
+
309
class Summarizer(ModelProcessor):
    """Summarizer that selects its model by keyword (or takes a custom model and tokenizer)."""

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the main Bert Summarizer class.

        :param model: This parameter is associated with the inherit string parameters from the transformers library.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The handler to process sentences. Defaults to a new SentenceHandler.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        # Create the default handler per instance instead of sharing one built at import time.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        super().__init__(
            model, custom_model, custom_tokenizer, hidden, reduce_option, sentence_handler, random_state,
            hidden_concat, gpu_id
        )
341
+
342
+
343
class TransformerSummarizer(ModelProcessor):
    """
    Newer style that has keywords for models and tokenizers, but allows the user to change the type.
    """

    # Transformer type keyword -> (model class, tokenizer class).
    MODEL_DICT = {
        'Bert': (BertModel, BertTokenizer),
        'OpenAIGPT': (OpenAIGPTModel, OpenAIGPTTokenizer),
        'GPT2': (GPT2Model, GPT2Tokenizer),
        'CTRL': (CTRLModel, CTRLTokenizer),
        'TransfoXL': (TransfoXLModel, TransfoXLTokenizer),
        'XLNet': (XLNetModel, XLNetTokenizer),
        'XLM': (XLMModel, XLMTokenizer),
        'DistilBert': (DistilBertModel, DistilBertTokenizer),
        # These classes are imported unconditionally at module level, so they can be registered directly
        # instead of being added to the shared class dict from every constructor call.
        'Roberta': (RobertaModel, RobertaTokenizer),
        'Albert': (AlbertModel, AlbertTokenizer),
        'Camembert': (CamembertModel, CamembertTokenizer),
        'Bart': (BartModel, BartTokenizer),
        'Longformer': (LongformerModel, LongformerTokenizer),
    }

    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc. Must be a key of MODEL_DICT.
        :param transformer_model_key: The transformer model key. This is the directory for the model.
        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
            Defaults to ``transformer_model_key``.
        :param hidden: The hidden output layers to use for the summarization.
        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
        :param sentence_handler: The sentence handler class to process the raw text. Defaults to a new SentenceHandler.
        :param random_state: The random state to use.
        :param hidden_concat: Deprecated hidden concat option.
        :param gpu_id: GPU device index if CUDA is available.
        :raises KeyError: If ``transformer_type`` is not a key of MODEL_DICT.
        """
        try:
            model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
        except KeyError:
            raise KeyError(
                f"Unknown transformer_type {transformer_type!r}; expected one of {sorted(self.MODEL_DICT)}"
            ) from None

        model = model_clz.from_pretrained(
            transformer_model_key, output_hidden_states=True)

        tokenizer = tokenizer_clz.from_pretrained(
            transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key
        )

        # Create the default handler per instance instead of sharing one built at import time.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        # The model keyword is None because the loaded model and tokenizer are passed in as custom ones.
        super().__init__(
            None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
summarizer/sentence_handler.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from spacy.lang.en import English
4
+
5
+
6
class SentenceHandler(object):
    """Splits raw text into sentences with a spaCy ``sentencizer`` pipeline."""

    def __init__(self, language=English):
        """
        Build the spaCy pipeline used for sentence splitting.

        :param language: The spaCy language class; it is called with no arguments to create the pipeline.
        """
        self.nlp = language()

        try:
            # spaCy 2.x style: create the component object first, then register it.
            sentencizer = self.nlp.create_pipe('sentencizer')
            self.nlp.add_pipe(sentencizer)
        except Exception:
            # spaCy 3.x style: the component is registered by its string name.
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True
        else:
            self.is_spacy_3 = False

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Collect the sentences of a spaCy document whose length is within bounds.

        Length is the character count of the whitespace-stripped sentence text,
        and both bounds are exclusive.

        :param doc: The spaCy document whose ``sents`` are iterated.
        :param min_length: A sentence must be strictly longer than this to be kept.
        :param max_length: A sentence must be strictly shorter than this to be kept.
        :return: The kept sentences, stripped of surrounding whitespace, in document order.
        """
        kept = []

        for span in doc.sents:
            size = len(span.text.strip())
            if size >= max_length or size <= min_length:
                continue

            # The spaCy 2.x code path reads the span through its `string` attribute.
            raw = span.text if self.is_spacy_3 else span.string
            kept.append(raw.strip())

        return kept

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Run the pipeline on raw text and return the sentences within the length bounds.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must exceed.
        :param max_length: Maximum length that the sentences must fall under.
        :return: A list of sentences.
        """
        parsed = self.nlp(body)
        return self.sentence_processor(parsed, min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Make the handler callable; delegates to :meth:`process`.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must exceed.
        :param max_length: Maximum length that the sentences must fall under.
        :return: A list of sentences.
        """
        return self.process(body, min_length, max_length)