theekshana commited on
Commit
0528be1
1 Parent(s): 6e0e411
.gitignore ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ # local stuff
141
+ Docs/
142
+ .DS_Store
143
+ .vscode/
144
+ test.ipynb
145
+ test.py
README.md CHANGED
@@ -7,7 +7,6 @@ sdk: streamlit
7
  sdk_version: 1.34.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  sdk_version: 1.34.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import validators
3
+ import streamlit as st
4
+ from transformers import AutoTokenizer, pipeline
5
+
6
+ # local modules
7
+ from extractive_summarizer.model_processors import Summarizer
8
+ from utils import (
9
+ clean_text,
10
+ fetch_article_text,
11
+ preprocess_text_for_abstractive_summarization,
12
+ read_text_from_file,
13
+ )
14
+
15
+ from rouge import Rouge
16
+
17
if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    st.title("Text Summarizer 📝")

    st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
    st.markdown(
        "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
    )
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )

    st.markdown(
        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
    )
    st.markdown(
        """- Raw text in text box
- URL of article/news to be summarized
- .txt, .pdf, .docx file formats"""
    )
    st.markdown(
        """This app supports two type of summarization:

1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
    )
    st.markdown("---")
    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    abs_max_length = 90
    abs_min_length = 30
    # ---------------------------

    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
        "<h3 style='text-align: center; color: green;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, .docx file for summarization"
    )

    # `validators.url` returns True on success but a falsy *failure object*
    # (not the constant False) otherwise, so normalise it to a real bool once
    # and only ever test it by truthiness.
    is_url = bool(validators.url(inp_text))
    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        text, cleaned_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        cleaned_txt = read_text_from_file(uploaded_file)
        cleaned_txt = clean_text(cleaned_txt)
    else:
        cleaned_txt = clean_text(inp_text)

    # view the input text (expander)
    with st.expander("View input text"):
        if is_url:
            # NOTE(review): only the first chunk of the fetched article is
            # shown here -- confirm whether the full text should be displayed.
            st.write(cleaned_txt[0])
        else:
            st.write(cleaned_txt)
    summarize = st.button("Summarize")

    # called on toggle button [summarize]
    if summarize:
        # Nothing to summarise: stop this script run instead of handing empty
        # text to the models and the ROUGE scorer.
        if not cleaned_txt:
            st.warning("Please enter some text, a URL, or upload a file first.")
            st.stop()

        if summarize_type == "Extractive":
            if is_url:
                # URL input arrives as a list of chunks; the extractive model
                # is called with one string.
                text_to_summarize = " ".join(cleaned_txt)
            else:
                text_to_summarize = cleaned_txt

            # extractive summarizer
            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, num_sentences=5)

        elif summarize_type == "Abstractive":
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = cleaned_txt
                abs_summarizer = pipeline(
                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
                )

                # Raw text / uploaded files still need splitting into chunks;
                # URL input is already chunked by `fetch_article_text`.
                # (The previous `is_url is False` test never matched, so this
                # step was silently skipped.)
                if not is_url:
                    # list of chunks
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=cleaned_txt
                    )

                tmp_sum = abs_summarizer(
                    text_to_summarize,
                    max_length=abs_max_length,
                    min_length=abs_min_length,
                    do_sample=False,
                )

                # one summary per chunk -> single string
                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)

        st.subheader("ROUGE Scores")
        rouge_sc = Rouge()
        # NOTE(review): for URL input the summary is scored against the first
        # chunk only -- confirm this is the intended reference text.
        ground_truth = cleaned_txt[0] if is_url else cleaned_txt
        score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
        st.code(score)
examples/tfile.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ New York: Gun violence has rocked the first three weeks of Eric Adams' tenure as mayor of New York, piling pressure on the ex-cop to deliver on his promise to improve public safety in America's largest city.
2
+ A shooting Friday in the neighborhood of Harlem left one police officer dead and another in critical condition.
3
+
4
+ It was the latest flashpoint in the Democratic mayor's nascent rule, in which he has yet to present a comprehensive plan to rein in the crime he has decried.
5
+
6
+ "It is our city against the killers," said Adams, a retired police captain, on Friday night at Harlem Hospital, where the officers -- who had been responding to a domestic disturbance -- were taken following the incident.
7
+
8
+ The recent shootings also include a shocking incident in which an 11-month-old girl was hit in the cheek by a stray bullet in the Bronx as she was in a parked car with her mother.
9
+
10
+ They are seen as part of a broader trend of gun violence fueled by the accessibility of firearms, against the backdrop of the social and economic toll of the Covid-19 pandemic.
11
+
12
+ And they're testing the new mayor's tough-on-crime campaign message, while setting up a potential showdown with the left flank of his party over police funding and crime reduction strategies.
13
+
14
+ "This is a sea of crime that's been fed by many rivers. We have to dam each one of those rivers," Adams told CNN's "State of the Union" talk show Sunday.
15
+
16
+ "These crimes did not start during my administration," he added. "They have been here for far too long in many parts of our community."
17
+
18
+ Earlier, Adams urged federal action on gun control while calling on New Yorkers to work with the police to stem violence.
19
+
20
+ "No matter how painful this moment is, don't give up on these people in this city," he said Friday.
21
+
22
+ Budget negotiations
23
+
24
+ Adams, 61, has clashed with his leftist critics, many of whom are vocal online and have pushed to "defund" the New York Police Department, the nation's largest.
25
+
26
+ Now that call may be coming to a head as Adams, whose position on policing has long rankled New Yorkers on the left, prepares to negotiate a new city budget.
27
+
28
+ He said recently he would consider exempting the police force, with a budget exceeding $5 billion, from citywide cost-cutting measures.
29
+
30
+ It was not clear whether those details would be part of the "real plan" for the city Adams said Sunday he would roll out this week.
31
+
32
+ Politicians who use "defund the police" as a rallying cry appear unlikely to give any leeway to Adams, who has already aggravated progressives over issues including remote learning.
33
+
34
+ Kristin Richardson Jordan, a leftist city council member, won her Harlem district on a "defund" platform, which advocates replacing policing with alternative public safety systems.
35
+
36
+ She expressed sadness over the killing of the police officer Friday, but added: "To be clear, the death of police officers is not what abolition is. Abolition is an end to violence altogether."
37
+
38
+ Blueprint for safety
39
+
40
+ Last year, police recorded 488 homicides in the city of nine million people, up 4.3 percent from 2020 -- though Jeffrey Butts, director of the research and evaluation center at John Jay College of Criminal Justice, points out that 25 years ago New York experienced four times the number of homicides it sees today.
41
+
42
+ While saying he disagrees with the notion of "defunding the police," Butts also told AFP "more police funding is not an appropriate response."
43
+
44
+ "How are those resources used? To what end? What's the strategy?" he said. "The foundation of our approach has to be economic well-being, health and the well-being of communities, which is a much broader public policy conversation."
45
+
46
+ Adriano Espaillat, a congressman whose district includes Harlem and parts of the Bronx, said Saturday "the federal government must play a pivotal role" in stemming the violence, citing a need for legislation mandating stronger background checks and accountability of gun manufacturers.
47
+
48
+ Ken Sherrill, a professor emeritus of political science at Hunter College, expressed surprise that Adams has not yet unveiled his pitch to tackle crime -- but said this is the moment to "mold public opinion."
49
+
50
+ "This hands the mayor an immense opportunity, and if he doesn't seize it I'm sure he will regret it," Sherrill told AFP.
51
+
52
+ Adams offered scant details about his upcoming public safety blueprint, but he said Sunday it would include the reinstitution of a "plainclothes anti-gun unit" and a bolstered police presence in the city's sprawling subway system.
53
+
54
+
55
+ But a top priority will be firearms: "We have to stop the flow of guns," Adams said.
extractive_summarizer/bert_parent.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import torch
4
+ import streamlit as st
5
+ import numpy as np
6
+ from numpy import ndarray
7
+ from transformers import (AlbertModel, AlbertTokenizer, BertModel,
8
+ BertTokenizer, DistilBertModel, DistilBertTokenizer,
9
+ PreTrainedModel, PreTrainedTokenizer, XLMModel,
10
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
11
+
12
@st.cache()
def load_hf_model(base_model, model_name, device):
    """
    Load pretrained weights through ``base_model`` and move them to ``device``.

    :param base_model: Object exposing ``from_pretrained`` (a transformers model class).
    :param model_name: Name or path handed to ``from_pretrained``.
    :param device: Target passed to ``.to(...)`` on the loaded model.
    :return: The loaded model, configured with ``output_hidden_states=True``.
    """
    # NOTE(review): `st.cache` is deprecated in recent Streamlit releases in
    # favour of `st.cache_resource`; confirm argument hashing before switching.
    return base_model.from_pretrained(model_name, output_hidden_states=True).to(device)
16
+
17
class BertParent(object):
    """
    Base handler for BERT models.

    Wraps a transformers model/tokenizer pair and turns sentences into
    fixed-size embedding vectors read from the model's hidden states.
    """

    # Supported model keywords mapped to their (model class, tokenizer class).
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
        'albert-base-v1': (AlbertModel, AlbertTokenizer),
        'albert-large-v1': (AlbertModel, AlbertTokenizer)
    }

    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        gpu_id: int = 0,
    ):
        """
        :param model: Key of ``MODELS`` naming the pretrained weights to load.
        :param custom_model: This is optional if a custom bert model is used.
        :param custom_tokenizer: Place to use custom tokenizer.
        :param gpu_id: GPU device index, used only when CUDA is available.
        :raises ValueError: If ``model`` is not a known keyword and no custom
            model/tokenizer is supplied to take its place.
        """
        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        # An unknown keyword is only acceptable when both the model and the
        # tokenizer are supplied by the caller; fail early with a clear message
        # instead of an AttributeError on None further down.
        if base_model is None and (custom_model is None or custom_tokenizer is None):
            raise ValueError(
                f"Unknown model '{model}'. Choose one of {sorted(self.MODELS)} "
                "or pass both `custom_model` and `custom_tokenizer`."
            )

        self.device = torch.device("cpu")
        if torch.cuda.is_available():
            assert (
                isinstance(gpu_id, int) and (0 <= gpu_id and gpu_id < torch.cuda.device_count())
            ), f"`gpu_id` must be an integer between 0 to {torch.cuda.device_count() - 1}. But got: {gpu_id}"

            self.device = torch.device(f"cuda:{gpu_id}")

        if custom_model is not None:
            self.model = custom_model.to(self.device)
        else:
            self.model = load_hf_model(base_model, model, self.device)

        if custom_tokenizer is not None:
            self.tokenizer = custom_tokenizer
        else:
            self.tokenizer = base_tokenizer.from_pretrained(model)

        # inference only
        self.model.eval()

    def tokenize_input(self, text: str) -> torch.Tensor:
        """
        Tokenizes the text input.

        :param text: Text to tokenize.
        :return: Token ids as a tensor with a leading batch dimension of one,
            placed on ``self.device``.
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        return torch.tensor([indexed_tokens]).to(self.device)

    def _pooled_handler(self, hidden: torch.Tensor,
                        reduce_option: str) -> torch.Tensor:
        """
        Reduces a hidden-state tensor along dimension 1.

        :param hidden: The hidden torch tensor to process.
        :param reduce_option: 'max' or 'median'; any other value falls back to mean.
        :return: The reduced (and squeezed) torch tensor.
        """
        if reduce_option == 'max':
            return hidden.max(dim=1)[0].squeeze()

        if reduce_option == 'median':
            return hidden.median(dim=1)[0].squeeze()

        return hidden.mean(dim=1).squeeze()

    def extract_embeddings(
        self,
        text: str,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> torch.Tensor:
        """
        Extracts the embeddings for the given text.

        :param text: The text to extract embeddings for.
        :param hidden: The hidden layer(s) to use for a readout handler.
        :param reduce_option: How we should reduce the items.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A torch vector.
        """
        tokens_tensor = self.tokenize_input(text)

        # Pure inference: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            # The last element of the model output is read as the per-layer
            # hidden states; the element before it is not needed.
            _, hidden_states = self.model(tokens_tensor)[-2:]

        # deprecated temporary keyword functions.
        if reduce_option == 'concat_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            cat_hidden_states = torch.cat(tuple(last_4), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        if reduce_option == 'reduce_last_4':
            last_4 = [hidden_states[i] for i in (-1, -2, -3, -4)]
            return torch.cat(tuple(last_4), dim=1).mean(axis=1).squeeze()

        # single layer requested
        if isinstance(hidden, int):
            hidden_s = hidden_states[hidden]
            return self._pooled_handler(hidden_s, reduce_option)

        # several layers requested
        last_states = [hidden_states[i] for i in hidden]

        if hidden_concat:
            # join along the last dimension, then average over dimension 1
            cat_hidden_states = torch.cat(tuple(last_states), dim=-1)
            return torch.mean(cat_hidden_states, dim=1).squeeze()

        # join along dimension 1 and reduce all layers together
        hidden_s = torch.cat(tuple(last_states), dim=1)
        return self._pooled_handler(hidden_s, reduce_option)

    def create_matrix(
        self,
        content: List[str],
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings.

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content, one row per sentence.
        """
        return np.asarray([
            np.squeeze(self.extract_embeddings(
                t, hidden=hidden, reduce_option=reduce_option, hidden_concat=hidden_concat
            ).data.cpu().numpy()) for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: int = -2,
        reduce_option: str = 'mean',
        hidden_concat: bool = False,
    ) -> ndarray:
        """
        Create matrix from the embeddings (shorthand for ``create_matrix``).

        :param content: The list of sentences.
        :param hidden: Which hidden layer to use.
        :param reduce_option: The reduce option to run.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :return: A numpy array matrix of the given content.
        """
        return self.create_matrix(content, hidden, reduce_option, hidden_concat)
extractive_summarizer/cluster_features.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+ from numpy import ndarray
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.decomposition import PCA
7
+ from sklearn.mixture import GaussianMixture
8
+
9
+
10
class ClusterFeatures(object):
    """
    Basic handling of clustering features.

    Clusters sentence embeddings and picks, for every cluster centre, the
    index of the closest embedding.
    """

    def __init__(
        self,
        features: ndarray,
        algorithm: str = 'kmeans',
        pca_k: int = None,
        random_state: int = 12345,
    ):
        """
        :param features: the embedding matrix created by bert parent.
        :param algorithm: Which clustering algorithm to use ('gmm', anything else means k-means).
        :param pca_k: If you want the features to be ran through pca, this is the components number.
        :param random_state: Random state.
        """
        if pca_k:
            self.features = PCA(n_components=pca_k).fit_transform(features)
        else:
            self.features = features

        self.algorithm = algorithm
        self.pca_k = pca_k
        self.random_state = random_state

    def __get_model(self, k: int):
        """
        Retrieve clustering model.

        :param k: amount of clusters.
        :return: Clustering model (not yet fitted).
        """
        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)

    def __get_centroids(self, model):
        """
        Retrieve centroids of model.

        :param model: Fitted clustering model.
        :return: Centroids.
        """
        if self.algorithm == 'gmm':
            return model.means_
        return model.cluster_centers_

    def __find_closest_args(self, centroids: np.ndarray) -> Dict:
        """
        Find the closest arguments to centroid.

        Each feature index is assigned to at most one centroid; centroids are
        served in order, so an earlier centroid wins a shared nearest feature.

        :param centroids: Centroids to find closest.
        :return: Mapping of centroid index to the chosen feature index
            (-1 when no unused feature lies closer than 1e10).
        """
        args = {}
        used_idx = set()

        for j, centroid in enumerate(centroids):
            centroid_min = 1e10
            cur_arg = -1

            for i, feature in enumerate(self.features):
                if i in used_idx:
                    continue

                value = np.linalg.norm(feature - centroid)
                if value < centroid_min:
                    cur_arg = i
                    centroid_min = value

            used_idx.add(cur_arg)
            args[j] = cur_arg

        return args

    def calculate_elbow(self, k_max: int) -> List[float]:
        """
        Calculates elbow up to the provided k_max.

        :param k_max: Exclusive upper bound of the cluster counts to try
            (also capped by the number of features).
        :return: The inertias for k = 1 .. min(k_max, len(features)) - 1.
        """
        # NOTE(review): this reads `inertia_`, which scikit-learn's
        # GaussianMixture does not appear to expose -- confirm behaviour when
        # `algorithm == 'gmm'`.
        inertias = []

        for k in range(1, min(k_max, len(self.features))):
            model = self.__get_model(k).fit(self.features)

            inertias.append(model.inertia_)

        return inertias

    def calculate_optimal_cluster(self, k_max: int):
        """
        Calculates the optimal cluster based on Elbow.

        :param k_max: The max k to search elbow for.
        :return: The optimal cluster size (1 when no elbow is found).
        """
        delta_1 = []
        delta_2 = []

        max_strength = 0
        k = 1

        inertias = self.calculate_elbow(k_max)

        # first and second order differences of the inertia curve
        for i in range(len(inertias)):
            delta_1.append(inertias[i] - inertias[i - 1] if i > 0 else 0.0)
            delta_2.append(delta_1[i] - delta_1[i - 1] if i > 1 else 0.0)

        for j in range(len(inertias)):
            # the first two and the last positions cannot form an elbow
            strength = 0 if j <= 1 or j == len(inertias) - 1 else delta_2[j + 1] - delta_1[j + 1]

            if strength > max_strength:
                max_strength = strength
                k = j + 1

        return k

    def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio.

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary, in ascending order.
        """
        if num_sentences is not None:
            if num_sentences == 0:
                return []

            # never ask for more clusters than there are sentences
            k = min(num_sentences, len(self.features))
        else:
            # at least one cluster, however small the ratio
            k = max(int(len(self.features) * ratio), 1)

        model = self.__get_model(k).fit(self.features)

        centroids = self.__get_centroids(model)
        cluster_args = self.__find_closest_args(centroids)

        return sorted(cluster_args.values())

    def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
        """
        Clusters sentences based on the ratio (shorthand for ``cluster``).

        :param ratio: Ratio to use for clustering.
        :param num_sentences: Number of sentences. Overrides ratio.
        :return: Sentences index that qualify for summary.
        """
        # forward num_sentences as well, so that it really overrides ratio
        return self.cluster(ratio, num_sentences)
extractive_summarizer/model_processors.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ from transformers import (AlbertModel, AlbertTokenizer, BartModel,
5
+ BartTokenizer, BertModel, BertTokenizer,
6
+ CamembertModel, CamembertTokenizer, CTRLModel,
7
+ CTRLTokenizer, DistilBertModel, DistilBertTokenizer,
8
+ GPT2Model, GPT2Tokenizer, LongformerModel,
9
+ LongformerTokenizer, OpenAIGPTModel,
10
+ OpenAIGPTTokenizer, PreTrainedModel,
11
+ PreTrainedTokenizer, RobertaModel, RobertaTokenizer,
12
+ TransfoXLModel, TransfoXLTokenizer, XLMModel,
13
+ XLMTokenizer, XLNetModel, XLNetTokenizer)
14
+
15
+ from extractive_summarizer.bert_parent import BertParent
16
+ from extractive_summarizer.cluster_features import ClusterFeatures
17
+ from extractive_summarizer.sentence_handler import SentenceHandler
18
+
19
+
20
+ class ModelProcessor(object):
21
+ aggregate_map = {
22
+ 'mean': np.mean,
23
+ 'min': np.min,
24
+ 'median': np.median,
25
+ 'max': np.max,
26
+ }
27
+
28
+ def __init__(
29
+ self,
30
+ model: str = 'bert-large-uncased',
31
+ custom_model: PreTrainedModel = None,
32
+ custom_tokenizer: PreTrainedTokenizer = None,
33
+ hidden: Union[List[int], int] = -2,
34
+ reduce_option: str = 'mean',
35
+ sentence_handler: SentenceHandler = SentenceHandler(),
36
+ random_state: int = 12345,
37
+ hidden_concat: bool = False,
38
+ gpu_id: int = 0,
39
+ ):
40
+ """
41
+ This is the parent Bert Summarizer model. New methods should implement this class.
42
+
43
+ :param model: This parameter is associated with the inherit string parameters from the transformers library.
44
+ :param custom_model: If you have a pre-trained model, you can add the model class here.
45
+ :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
46
+ :param hidden: This signifies which layer(s) of the BERT model you would like to use as embeddings.
47
+ :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
48
+ :param sentence_handler: The handler to process sentences. If want to use coreference, instantiate and pass.
49
+ CoreferenceHandler instance
50
+ :param random_state: The random state to reproduce summarizations.
51
+ :param hidden_concat: Whether or not to concat multiple hidden layers.
52
+ :param gpu_id: GPU device index if CUDA is available.
53
+ """
54
+ np.random.seed(random_state)
55
+ self.model = BertParent(model, custom_model, custom_tokenizer, gpu_id)
56
+ self.hidden = hidden
57
+ self.reduce_option = reduce_option
58
+ self.sentence_handler = sentence_handler
59
+ self.random_state = random_state
60
+ self.hidden_concat = hidden_concat
61
+
62
+ def cluster_runner(
63
+ self,
64
+ content: List[str],
65
+ ratio: float = 0.2,
66
+ algorithm: str = 'kmeans',
67
+ use_first: bool = True,
68
+ num_sentences: int = None
69
+ ) -> Tuple[List[str], np.ndarray]:
70
+ """
71
+ Runs the cluster algorithm based on the hidden state. Returns both the embeddings and sentences.
72
+
73
+ :param content: Content list of sentences.
74
+ :param ratio: The ratio to use for clustering.
75
+ :param algorithm: Type of algorithm to use for clustering.
76
+ :param use_first: Return the first sentence in the output (helpful for news stories, etc).
77
+ :param num_sentences: Number of sentences to use for summarization.
78
+ :return: A tuple of summarized sentences and embeddings
79
+ """
80
+ if num_sentences is not None:
81
+ num_sentences = num_sentences if use_first else num_sentences
82
+
83
+ hidden = self.model(
84
+ content, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat)
85
+ hidden_args = ClusterFeatures(
86
+ hidden, algorithm, random_state=self.random_state).cluster(ratio, num_sentences)
87
+
88
+ if use_first:
89
+
90
+ if not hidden_args:
91
+ hidden_args.append(0)
92
+
93
+ elif hidden_args[0] != 0:
94
+ hidden_args.insert(0, 0)
95
+
96
+ sentences = [content[j] for j in hidden_args]
97
+ embeddings = np.asarray([hidden[j] for j in hidden_args])
98
+
99
+ return sentences, embeddings
100
+
101
+ def __run_clusters(
102
+ self,
103
+ content: List[str],
104
+ ratio: float = 0.2,
105
+ algorithm: str = 'kmeans',
106
+ use_first: bool = True,
107
+ num_sentences: int = None
108
+ ) -> List[str]:
109
+ """
110
+ Runs clusters and returns sentences.
111
+
112
+ :param content: The content of sentences.
113
+ :param ratio: Ratio to use for for clustering.
114
+ :param algorithm: Algorithm selection for clustering.
115
+ :param use_first: Whether to use first sentence
116
+ :param num_sentences: Number of sentences. Overrides ratio.
117
+ :return: summarized sentences
118
+ """
119
+ sentences, _ = self.cluster_runner(
120
+ content, ratio, algorithm, use_first, num_sentences)
121
+ return sentences
122
+
123
+ def __retrieve_summarized_embeddings(
124
+ self,
125
+ content: List[str],
126
+ ratio: float = 0.2,
127
+ algorithm: str = 'kmeans',
128
+ use_first: bool = True,
129
+ num_sentences: int = None
130
+ ) -> np.ndarray:
131
+ """
132
+ Retrieves embeddings of the summarized sentences.
133
+
134
+ :param content: The content of sentences.
135
+ :param ratio: Ratio to use for for clustering.
136
+ :param algorithm: Algorithm selection for clustering.
137
+ :param use_first: Whether to use first sentence
138
+ :return: Summarized embeddings
139
+ """
140
+ _, embeddings = self.cluster_runner(
141
+ content, ratio, algorithm, use_first, num_sentences)
142
+ return embeddings
143
+
144
def calculate_elbow(
    self,
    body: str,
    algorithm: str = 'kmeans',
    min_length: int = 40,
    max_length: int = 600,
    k_max: int = None,
) -> List[float]:
    """
    Computes the elbow (inertia) values over a range of cluster counts.

    The body is split with ``self.sentence_handler``, embedded with
    ``self.model`` and handed to ``ClusterFeatures.calculate_elbow``.

    :param body: The input body to summarize.
    :param algorithm: The algorithm to use for clustering.
    :param min_length: The min length to use.
    :param max_length: The max length to use.
    :param k_max: The maximum number of clusters to search. When omitted,
        one less than the number of extracted sentences is used.
    :return: List of elbow inertia values.
    """
    candidates = self.sentence_handler(body, min_length, max_length)
    search_limit = len(candidates) - 1 if k_max is None else k_max

    sentence_embeddings = self.model(
        candidates, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat
    )
    clusterer = ClusterFeatures(
        sentence_embeddings, algorithm, random_state=self.random_state
    )
    return clusterer.calculate_elbow(search_limit)
173
+
174
def calculate_optimal_k(
    self,
    body: str,
    algorithm: str = 'kmeans',
    min_length: int = 40,
    max_length: int = 600,
    k_max: int = None,
):
    """
    Finds the optimal number of clusters (elbow K) for the given body.

    The body is split with ``self.sentence_handler``, embedded with
    ``self.model`` and handed to ``ClusterFeatures.calculate_optimal_cluster``.

    :param body: The input body to summarize.
    :param algorithm: The algorithm to use for clustering.
    :param min_length: The min length to use.
    :param max_length: The max length to use.
    :param k_max: The maximum number of clusters to search. When omitted,
        one less than the number of extracted sentences is used.
    :return: Whatever ``ClusterFeatures.calculate_optimal_cluster`` returns
        (presumably the optimal cluster count -- confirm against that class).
    """
    candidates = self.sentence_handler(body, min_length, max_length)
    search_limit = len(candidates) - 1 if k_max is None else k_max

    sentence_embeddings = self.model(
        candidates, self.hidden, self.reduce_option, hidden_concat=self.hidden_concat
    )
    clusterer = ClusterFeatures(
        sentence_embeddings, algorithm, random_state=self.random_state
    )
    return clusterer.calculate_optimal_cluster(search_limit)
203
+
204
def run_embeddings(
    self,
    body: str,
    ratio: float = 0.2,
    min_length: int = 40,
    max_length: int = 600,
    use_first: bool = True,
    algorithm: str = 'kmeans',
    num_sentences: int = None,
    aggregate: str = None,
) -> Optional[np.ndarray]:
    """
    Preprocesses the sentences, runs the clusters to find the centroids, then combines the embeddings.

    :param body: The raw string body to process
    :param ratio: Ratio of sentences to use
    :param min_length: Minimum length of sentence candidates to utilize for the summary.
    :param max_length: Maximum length of sentence candidates to utilize for the summary
    :param use_first: Whether or not to use the first sentence
    :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
    :param num_sentences: Number of sentences to use. Overrides ratio.
    :param aggregate: One of mean, median, max, min. Applied on zero axis
    :return: A summary embedding, or None when no sentence candidates were extracted.
    :raises AssertionError: If sentences were extracted and ``aggregate`` is
        not None and not one of mean, median, max, min.
    """
    sentences = self.sentence_handler(body, min_length, max_length)
    if not sentences:
        return None

    # Validate before the expensive embedding/clustering step so a bad value
    # fails fast. AssertionError is raised explicitly (rather than via an
    # ``assert`` statement) so the check is not stripped under ``python -O``
    # while callers still see the same exception type and message.
    if aggregate is not None and aggregate not in ('mean', 'median', 'max', 'min'):
        raise AssertionError("aggregate must be mean, min, max, or median")

    embeddings = self.__retrieve_summarized_embeddings(
        sentences, ratio, algorithm, use_first, num_sentences)

    if aggregate is not None:
        # Collapse the per-sentence embeddings into one vector along axis 0.
        embeddings = self.aggregate_map[aggregate](embeddings, axis=0)

    return embeddings
242
+
243
def run(
    self,
    body: str,
    ratio: float = 0.2,
    min_length: int = 40,
    max_length: int = 600,
    use_first: bool = True,
    algorithm: str = 'kmeans',
    num_sentences: int = None,
    return_as_list: bool = False
) -> Union[List, str]:
    """
    Splits the body into sentence candidates, clusters them and returns the summary.

    :param body: The raw string body to process
    :param ratio: Ratio of sentences to use
    :param min_length: Minimum length of sentence candidates to utilize for the summary.
    :param max_length: Maximum length of sentence candidates to utilize for the summary
    :param use_first: Whether or not to use the first sentence
    :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
    :param num_sentences: Number of sentences to use (overrides ratio).
    :param return_as_list: Whether or not to return sentences as list.
    :return: The summary sentences as a list when ``return_as_list`` is set,
        otherwise joined into one string with single spaces.
    """
    candidates = self.sentence_handler(body, min_length, max_length)

    # Clustering is skipped when no candidates were extracted; the empty
    # result is passed through as-is.
    summary = (
        self.__run_clusters(candidates, ratio, algorithm, use_first, num_sentences)
        if candidates
        else candidates
    )

    return summary if return_as_list else ' '.join(summary)
277
+
278
def __call__(
    self,
    body: str,
    ratio: float = 0.2,
    min_length: int = 40,
    max_length: int = 600,
    use_first: bool = True,
    algorithm: str = 'kmeans',
    num_sentences: int = None,
    return_as_list: bool = False,
) -> Union[List, str]:
    """
    (utility that wraps around the run function)
    Preprocesses the sentences, runs the clusters to find the centroids, then combines the sentences.

    :param body: The raw string body to process.
    :param ratio: Ratio of sentences to use.
    :param min_length: Minimum length of sentence candidates to utilize for the summary.
    :param max_length: Maximum length of sentence candidates to utilize for the summary.
    :param use_first: Whether or not to use the first sentence.
    :param algorithm: Which clustering algorithm to use. (kmeans, gmm)
    :param num_sentences: Number of sentences to use (overrides ratio).
    :param return_as_list: Whether or not to return sentences as list.
    :return: Whatever ``run`` returns: a list of sentences when
        ``return_as_list`` is True, otherwise a single summary string.
    """
    # ``run`` orders ``use_first`` before ``algorithm``; both are passed by
    # keyword so the call does not depend on that ordering.
    return self.run(
        body, ratio, min_length, max_length, algorithm=algorithm, use_first=use_first, num_sentences=num_sentences,
        return_as_list=return_as_list
    )
307
+
308
+
309
class Summarizer(ModelProcessor):
    """
    The main BERT summarizer: a ``ModelProcessor`` with BERT-oriented defaults.
    """

    def __init__(
        self,
        model: str = 'bert-large-uncased',
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        This is the main Bert Summarizer class.

        :param model: Model name string from the transformers library, e.g. 'bert-large-uncased'.
        :param custom_model: If you have a pre-trained model, you can add the model class here.
        :param custom_tokenizer: If you have a custom tokenizer, you can add the tokenizer here.
        :param hidden: This signifies which layer of the BERT model you would like to use as embeddings.
        :param reduce_option: Given the output of the bert model, this param determines how you want to reduce results.
        :param sentence_handler: The sentence handler used to process the raw text.
            A new ``SentenceHandler()`` is created when omitted.
        :param random_state: The random state to reproduce summarizations.
        :param hidden_concat: Whether or not to concat multiple hidden layers.
        :param gpu_id: GPU device index if CUDA is available.
        """
        # Build the default handler per instance instead of in the signature:
        # a default argument is evaluated once at definition time, which would
        # construct the spaCy pipeline on import and share it across instances.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        super().__init__(
            model, custom_model, custom_tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
341
+
342
+
343
class TransformerSummarizer(ModelProcessor):
    """
    Another type of Summarizer class to choose keyword based model and tokenizer
    """

    # Maps a transformer type keyword to its (model class, tokenizer class).
    MODEL_DICT = {
        'Bert': (BertModel, BertTokenizer),
        'OpenAIGPT': (OpenAIGPTModel, OpenAIGPTTokenizer),
        'GPT2': (GPT2Model, GPT2Tokenizer),
        'CTRL': (CTRLModel, CTRLTokenizer),
        'TransfoXL': (TransfoXLModel, TransfoXLTokenizer),
        'XLNet': (XLNetModel, XLNetTokenizer),
        'XLM': (XLMModel, XLMTokenizer),
        'DistilBert': (DistilBertModel, DistilBertTokenizer),
    }

    def __init__(
        self,
        transformer_type: str = 'Bert',
        transformer_model_key: str = 'bert-base-uncased',
        transformer_tokenizer_key: str = None,
        hidden: Union[List[int], int] = -2,
        reduce_option: str = 'mean',
        sentence_handler: Optional[SentenceHandler] = None,
        random_state: int = 12345,
        hidden_concat: bool = False,
        gpu_id: int = 0,
    ):
        """
        :param transformer_type: The Transformer type, such as Bert, GPT2, DistilBert, etc.
        :param transformer_model_key: The transformer model key. This is the directory for the model.
        :param transformer_tokenizer_key: The transformer tokenizer key. This is the tokenizer directory.
            Falls back to ``transformer_model_key`` when omitted.
        :param hidden: The hidden output layers to use for the summarization.
        :param reduce_option: The reduce option, such as mean, max, min, median, etc.
        :param sentence_handler: The sentence handler class to process the raw text.
            A new ``SentenceHandler()`` is created when omitted.
        :param random_state: The random state to use.
        :param hidden_concat: Deprecated hidden concat option.
        :param gpu_id: GPU device index if CUDA is available.
        :raises KeyError: If ``transformer_type`` is not a registered type.
        """
        # Optional model families. Each entry is resolved lazily and on its
        # own, so a class that is not defined in this module (presumably
        # because an older transformers version lacks it -- confirm against
        # the file's imports) only skips that entry instead of all that follow.
        optional_models = (
            ('Roberta', lambda: (RobertaModel, RobertaTokenizer)),
            ('Albert', lambda: (AlbertModel, AlbertTokenizer)),
            ('Camembert', lambda: (CamembertModel, CamembertTokenizer)),
            ('Bart', lambda: (BartModel, BartTokenizer)),
            ('Longformer', lambda: (LongformerModel, LongformerTokenizer)),
        )
        for type_key, resolve in optional_models:
            try:
                self.MODEL_DICT[type_key] = resolve()
            except NameError:
                continue  # older transformer version

        try:
            model_clz, tokenizer_clz = self.MODEL_DICT[transformer_type]
        except KeyError:
            raise KeyError(
                "Unknown transformer_type {!r}; expected one of: {}".format(
                    transformer_type, ', '.join(sorted(self.MODEL_DICT))
                )
            ) from None

        model = model_clz.from_pretrained(
            transformer_model_key, output_hidden_states=True)

        tokenizer = tokenizer_clz.from_pretrained(
            transformer_tokenizer_key if transformer_tokenizer_key is not None else transformer_model_key
        )

        # Build the default handler per instance; a ``SentenceHandler()``
        # default in the signature would be created once at definition time
        # and shared by every instance.
        if sentence_handler is None:
            sentence_handler = SentenceHandler()

        super().__init__(
            None, model, tokenizer, hidden, reduce_option, sentence_handler, random_state, hidden_concat, gpu_id
        )
extractive_summarizer/sentence_handler.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from spacy.lang.en import English
4
+
5
+
6
class SentenceHandler(object):
    """Splits raw text into length-filtered sentences using a spaCy sentencizer."""

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        :param language: The spaCy language class to instantiate.
        """
        self.nlp = language()

        try:
            # spaCy 2.x style: build the component, then add the object.
            sentencizer = self.nlp.create_pipe('sentencizer')
            self.nlp.add_pipe(sentencizer)
        except Exception:
            # spaCy 3.x style: components are added by name.
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True
        else:
            self.is_spacy_3 = False

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Extracts the sentences of a spaCy document that pass the length filter.

        :param doc: The spaCy document to read sentences from.
        :param min_length: A sentence must be strictly longer than this (after stripping).
        :param max_length: A sentence must be strictly shorter than this (after stripping).
        :return: The stripped sentences that passed the filter.
        """
        kept = []

        for span in doc.sents:
            stripped_length = len(span.text.strip())
            if not (min_length < stripped_length < max_length):
                continue

            # The spaCy 2 code path reads the span through ``.string``.
            raw = span.text if self.is_spacy_3 else span.string
            kept.append(raw.strip())

        return kept

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Runs the spaCy pipeline over the body and returns its filtered sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        parsed = self.nlp(body)
        return self.sentence_processor(parsed, min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Shorthand for :meth:`process`.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ torch
3
+ spacy
4
+ scikit-learn
5
+ transformers
6
+ streamlit
7
+ sentencepiece
8
+ beautifulsoup4
9
+ nltk
10
+ PyPDF2
11
+ docx2txt
12
+ rouge
13
+ altair==4.0
utils.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ import docx2txt
4
+ from io import StringIO
5
+ from PyPDF2 import PdfFileReader
6
+
7
+ from bs4 import BeautifulSoup
8
+ from nltk.tokenize import sent_tokenize
9
+
10
# Matches one or more consecutive characters from the code-point ranges below.
# NOTE: the final range (U+24C2..U+1F251) is very wide and also spans many
# non-emoji characters (e.g. CJK ideographs).
emoji_pattern = re.compile(
    "["
    + "".join(
        (
            "\U0001F600-\U0001F64F",  # emoticons
            "\U0001F300-\U0001F5FF",  # symbols & pictographs
            "\U0001F680-\U0001F6FF",  # transport & map symbols
            "\U0001F1E0-\U0001F1FF",  # flags (iOS)
            "\U00002702-\U000027B0",
            "\U000024C2-\U0001F251",
        )
    )
    + "]+",
    flags=re.UNICODE,
)
21
+
22
+
23
def clean_text(x):
    """
    Normalizes raw text: drops non-ASCII characters, URLs, mentions and
    hashtags, then replaces every run of characters other than letters,
    digits and ``.,!?`` with a single space.

    Case, digits and the ``.,!?`` punctuation are left untouched.

    :param x: The raw text.
    :return: The cleaned text.
    """
    cleaned = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters

    # Applied in order: URLs, @mentions, #hashtags, then whitespace runs.
    for pattern in (r"https*\S+", r"@\S+", r"#\S+", r"\s{2,}"):
        cleaned = re.sub(pattern, " ", cleaned)

    cleaned = emoji_pattern.sub(r"", cleaned)  # emojis
    # Special characters except .,!? (whitespace runs collapse to one space).
    return re.sub("[^.,!?A-Za-z0-9]+", " ", cleaned)
37
+
38
+
39
def fetch_article_text(url: str, timeout: float = 30.0, max_words: int = 500):
    """
    Downloads a web page and splits the text of its ``<h1>``/``<p>`` elements
    into word-limited chunks.

    :param url: Address of the page to download.
    :param timeout: Seconds to wait for the server before ``requests`` gives up.
    :param max_words: Maximum number of space-separated tokens per chunk. A
        single sentence longer than this still becomes its own chunk.
    :return: Tuple ``(article, chunks)``. ``article`` is the joined element
        text with an ``<eos>`` marker inserted after every ``.``, ``!`` and
        ``?``; ``chunks`` is a list of strings built from consecutive sentences.
    """
    # Without a timeout requests may block indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Mark sentence boundaries, then split on the marker.
    for mark in ".!?":
        article = article.replace(mark, mark + "<eos>")

    chunks = []
    for sentence in article.split("<eos>"):
        words = sentence.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)  # still fits in the current chunk
        else:
            chunks.append(words)  # first sentence, or current chunk is full

    return article, [" ".join(words) for words in chunks]
67
+
68
+
69
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """
    Splits text into chunks of whole sentences that fit the tokenizer's limit.

    Sentences come from ``sent_tokenize`` and are packed greedily: a sentence
    joins the current chunk while the running token count stays within
    ``tokenizer.max_len_single_sentence``; otherwise it starts a new chunk.
    A single sentence longer than the limit becomes a chunk of its own.

    :param tokenizer: Object exposing ``tokenize(str)`` and
        ``max_len_single_sentence``.
    :param text: The raw text to split.
    :return: List of chunk strings, in order; every sentence appears in
        exactly one chunk and no chunk is empty.
    """
    token_limit = tokenizer.max_len_single_sentence

    chunks = []
    current_sentences = []
    current_tokens = 0

    for sentence in sent_tokenize(text):
        sentence_tokens = len(tokenizer.tokenize(sentence))

        # Close the running chunk when this sentence would overflow it. The
        # emptiness check avoids emitting an empty chunk when the very first
        # sentence is already over the limit.
        if current_sentences and current_tokens + sentence_tokens > token_limit:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_tokens = 0

        current_sentences.append(sentence)
        current_tokens += sentence_tokens

    # Flush whatever is left, including an overflow sentence that happened to
    # be the last one.
    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks
103
+
104
+
105
def read_pdf(file):
    """
    Extracts the text of every page of a PDF and concatenates it.

    Uses the current PyPDF2 API (``PdfReader`` / ``pages`` / ``extract_text``)
    when available, because the legacy names (``PdfFileReader``, ``numPages``,
    ``getPage``, ``extractText``) raise in PyPDF2 3.x. Older releases that do
    not export ``PdfReader`` fall back to the legacy API.

    :param file: Whatever the PyPDF2 reader accepts as a PDF source (the reader
        is called with it directly).
    :return: The text of all pages joined in page order, without separators.
    """
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        # Legacy PyPDF2 without the snake_case API.
        legacy_reader = PdfFileReader(file)
        return "".join(
            legacy_reader.getPage(page_index).extractText()
            for page_index in range(legacy_reader.numPages)
        )

    reader = PdfReader(file)
    # ``or ""`` guards against a page yielding no text object.
    return "".join(page.extract_text() or "" for page in reader.pages)
114
+
115
+
116
def read_text_from_file(file):
    """
    Reads the text content of an uploaded file based on its MIME type.

    Supported types are plain text (decoded as UTF-8), PDF (via ``read_pdf``)
    and DOCX (via ``docx2txt.process``).

    :param file: Object exposing a ``type`` MIME string and, for plain text,
        ``getvalue()`` returning bytes (presumably a Streamlit uploaded file --
        confirm against the caller).
    :return: The extracted text.
    :raises ValueError: If the file's MIME type is not one of the supported types.
    """
    file_type = file.type

    if file_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if file_type == "application/pdf":
        return read_pdf(file)

    if file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return docx2txt.process(file)

    # Fail with a clear message instead of falling through to an unbound
    # local variable.
    raise ValueError("Unsupported file type: {!r}".format(file_type))