nickmuchi committed
Commit
6a43aa5
1 Parent(s): 6a51db7

Update app.py

Files changed (1): app.py +252 -252
app.py CHANGED
@@ -1,252 +1,252 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import validators, re
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import streamlit as st
from transformers import pipeline
import time
import base64
import requests
import docx2txt
from io import StringIO
from PyPDF2 import PdfFileReader
import warnings
warnings.filterwarnings("ignore")


# In[2]:

time_str = time.strftime("%d%m%Y-%H%M%S")
#Functions

def article_text_extractor(url: str):

    '''Extract text from url and divide text into chunks if length of text is more than 500 words'''

    ua = UserAgent()

    headers = {'User-Agent':str(ua.chrome)}

    r = requests.get(url,headers=headers)

    soup = BeautifulSoup(r.text, "html.parser")
    title_text = soup.find_all(["h1"])
    para_text = soup.find_all(["p"])
    article_text = [result.text for result in para_text]
    article_header = [result.text for result in title_text][0]
    article = " ".join(article_text)
    article = article.replace(".", ".<eos>")
    article = article.replace("!", "!<eos>")
    article = article.replace("?", "?<eos>")
    sentences = article.split("<eos>")

    current_chunk = 0
    chunks = []

    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
-           if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 600:
+           if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            print(current_chunk)
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return article_header, chunks

def preprocess_plain_text(x):

    x = x.encode("ascii", "ignore").decode() # unicode
    x = re.sub(r"https*\S+", " ", x) # url
    x = re.sub(r"@\S+", " ", x) # mentions
    x = re.sub(r"#\S+", " ", x) # hastags
    x = re.sub(r"\s{2,}", " ", x) # over spaces
    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x) # special charachters except .,!?

    return x

def extract_pdf(file):

    '''Extract text from PDF file'''

    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    all_text = ""
    for i in range(count):
        page = pdfReader.getPage(i)
        all_text += page.extractText()

    return all_text


def extract_text_from_file(file):

    '''Extract text from uploaded file'''

    # read text file
    if file.type == "text/plain":
        # To convert to a string based IO:
        stringio = StringIO(file.getvalue().decode("utf-8"))

        # To read file as string:
        file_text = stringio.read()

    # read pdf file
    elif file.type == "application/pdf":
        file_text = extract_pdf(file)

    # read docx file
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_text = docx2txt.process(file)

    return file_text

def summary_downloader(raw_text):

    b64 = base64.b64encode(raw_text.encode()).decode()
    new_filename = "new_text_file_{}_.txt".format(time_str)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href,unsafe_allow_html=True)

@st.cache(allow_output_mutation=True)
def facebook_model():

    summarizer = pipeline('summarization',model='facebook/bart-large-cnn')
    return summarizer

@st.cache(allow_output_mutation=True)
def schleifer_model():

    summarizer = pipeline('summarization',model='sshleifer/distilbart-cnn-12-6')
    return summarizer

#Streamlit App

st.title("Article Text and Link Extractive Summarizer 📝")

model_type = st.sidebar.selectbox(
    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart"]
)

st.markdown(
    "Model Source: [Facebook-Bart-large-CNN](https://huggingface.co/facebook/bart-large-cnn) and [Sshleifer-distilbart-cnn-12-6](https://huggingface.co/sshleifer/distilbart-cnn-12-6)"
)

st.markdown(
    """The app supports extractive summarization which aims to identify the salient information that is then extracted and grouped together to form a concise summary.
For documents or text that is more than 500 words long, the app will divide the text into chunks and summarize each chunk.
There are two models available to choose from:""")

st.markdown("""
- Facebook-Bart, trained on large [CNN and Daily Mail](https://huggingface.co/datasets/cnn_dailymail) news articles.
- Sshleifer-Distilbart, which is a distilled (smaller) version of the large Bart model."""
)

st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")

st.markdown(
    "The app only ingests the below formats for summarization task:"
)
st.markdown(
    """- Raw text entered in text box.
- URL of an article to be summarized.
- Documents with .txt, .pdf or .docx file formats."""
)

st.markdown("---")

url_text = st.text_input("Please Enter a url here")


st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

plain_text = st.text_input("Please Paste/Enter plain text here")

st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

upload_doc = st.file_uploader(
    "Upload a .txt, .pdf, .docx file for summarization"
)

is_url = validators.url(url_text)

if is_url:
    # complete text, chunks to summarize (list of sentences for long docs)
    article_title,chunks = article_text_extractor(url=url_text)

elif upload_doc:

    clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))

else:

    clean_text = preprocess_plain_text(plain_text)

summarize = st.button("Summarize")

# called on toggle button [summarize]
if summarize:
    if model_type == "Facebook-Bart":
        if is_url:
            text_to_summarize = chunks
        else:
            text_to_summarize = clean_text

        with st.spinner(
            text="Loading Facebook-Bart Model and Extracting summary. This might take a few seconds depending on the length of your text..."
        ):
            summarizer_model = facebook_model()
            summarized_text = summarizer_model(text_to_summarize, max_length=100, min_length=30)
            summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])

    elif model_type == "Sshleifer-DistilBart":
        if is_url:
            text_to_summarize = chunks
        else:
            text_to_summarize = clean_text

        with st.spinner(
            text="Loading Sshleifer-DistilBart Model and Extracting summary. This might take a few seconds depending on the length of your text..."
        ):
            summarizer_model = schleifer_model()
            summarized_text = summarizer_model(text_to_summarize, max_length=100, min_length=30)
            summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])

    # final summarized output
    st.subheader("Summarized text")

    if is_url:

        # view summarized text (expander)
        st.markdown(f"Article title: {article_title}")

    st.write(summarized_text)

    summary_downloader(summarized_text)


# In[ ]:
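The only substantive change in this commit is the chunk-size cap inside article_text_extractor, lowered from 600 to 500 words; this also brings the code in line with the function's docstring, which already said 500. The sketch below re-expresses that greedy sentence-packing loop as a standalone function so it can be tried outside Streamlit. It is a minimal illustration, not part of the commit: the name chunk_sentences, the max_words parameter, and the sample text are all invented here.

def chunk_sentences(text: str, max_words: int = 500) -> list:
    '''Greedily pack sentences into chunks of at most max_words words,
    mirroring the chunking loop in article_text_extractor.'''
    # Mark sentence boundaries the same way app.py does, via an <eos> marker.
    for punct in (".", "!", "?"):
        text = text.replace(punct, punct + "<eos>")
    sentences = text.split("<eos>")

    chunks = []
    for sentence in sentences:
        words = sentence.split(" ")
        # Open a new chunk when none exists yet or the current one would overflow.
        if not chunks or len(chunks[-1]) + len(words) > max_words:
            chunks.append(words)
        else:
            chunks[-1].extend(words)
    return [" ".join(chunk) for chunk in chunks]

# Illustrative usage: a long text yields several chunks of at most 500 words each.
article = "First sentence. Second one! A question? " * 200
print([len(chunk.split(" ")) for chunk in chunk_sentences(article)])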
 
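Downstream, app.py feeds the resulting list of chunks to a transformers summarization pipeline, which returns one summary dict per input string; the per-chunk summaries are then joined into a single text. A short sketch of that step, reusing chunk_sentences and article from the illustration above, with the same model id and length parameters that app.py passes (running it downloads the model):

from transformers import pipeline

# Same model and generation-length parameters app.py uses for the Facebook-Bart option.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

chunks = chunk_sentences(article, max_words=500)

# The pipeline maps over the list and returns one dict per chunk.
results = summarizer(chunks, max_length=100, min_length=30)
summary = " ".join(result["summary_text"] for result in results)
print(summary)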