File size: 3,902 Bytes
3dabb3d
3058374
5abd3db
7f4f300
719949c
9433e8d
3058374
3dabb3d
 
5abd3db
3dabb3d
 
 
3058374
 
a8c3652
dd5ac98
 
be2791c
 
 
e029660
dd5ac98
f9fc694
9433e8d
 
 
 
 
 
b39512e
9433e8d
 
3dabb3d
 
f9fc694
387be3c
3dabb3d
 
f9fc694
 
 
 
 
 
 
 
 
 
 
 
 
3dabb3d
 
75c9590
3dabb3d
f9fc694
 
3dabb3d
04271f2
 
 
 
 
3dabb3d
 
 
 
c4031d7
3dabb3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc7e03d
da063ef
dc7e03d
3dabb3d
75c9590
3dabb3d
c4031d7
387be3c
 
2c44dbc
3dabb3d
213a0bb
 
 
6b6d74f
 
 
06a5293
3dabb3d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from summarizer import Summarizer
from goose3 import Goose
from fake_useragent import UserAgent
from bs4 import BeautifulSoup   
from transformers import pipeline
import re
#from newsplease import NewsPlease
import validators
import streamlit as st
import requests
import warnings
warnings.filterwarnings("ignore")

g = Goose()

def article_text_extractor(url: str) -> str:
  '''Download the article at *url* with Goose and return its cleaned body text.

  Reuters pages carry boilerplate that Goose leaves in place: a trailing
  paragraph (when "reuters" appears in the url) and a leading "REUTERS"
  byline line. Both are stripped here.
  '''
  paper = g.extract(url=url)
  paragraphs = list(filter(None, paper.cleaned_text.split("\n")))
  if not paragraphs:
    # Nothing was extracted -- the original indexed [0] here and raised
    # IndexError on empty pages.
    return paper.cleaned_text
  first_sentence = paragraphs[0]
  # Drop the trailing paragraph on reuters urls (boilerplate).
  text = paper.cleaned_text if "reuters" not in url else "\n".join(paragraphs[:-1])
  # Drop a leading "REUTERS" byline line if present.
  text = text if "REUTERS" not in first_sentence else "\n".join(list(filter(None, text.split("\n")))[1:])
  return text
  

def preprocess_text(x):
  """Normalise raw article text for the summariser.

  Drops non-ascii characters, then removes URLs, @mentions and #hashtags,
  collapses runs of whitespace, and finally replaces every character
  outside .,!'?A-Za-z0-9 with a single space.
  """
  cleaned = x.encode("ascii", "ignore").decode()  # strip non-ascii
  # Remove noise tokens, then collapse the whitespace they leave behind.
  for pattern in (r"https*\S+", r"@\S+", r"#\S+", r"\s{2,}"):
    cleaned = re.sub(pattern, " ", cleaned)
  # Keep only word characters and basic punctuation (.,!'?).
  cleaned = re.sub("[^.,!'?A-Za-z0-9]+", " ", cleaned)
  return cleaned
  
@st.cache(allow_output_mutation=True)
def extractive_model():
  """Load (and cache across reruns) the DistilBERT extractive summariser."""
  return Summarizer('distilbert-base-uncased')
  
@st.cache(allow_output_mutation=True)
def facebook_model():
  """Load (and cache across reruns) the BART abstractive summarisation pipeline."""
  return pipeline('summarization', model='facebook/bart-large-cnn')

def model():
  '''Return the summariser that matches the sidebar's `summary_type` selection.

  NOTE(review): the original decorated this with @st.cache; with no
  arguments the cache key never changes, so after the first run switching
  the summary type in the sidebar kept returning the previously selected
  model. The underlying loaders (facebook_model / extractive_model) are
  already cached individually, so the redundant decorator is removed.
  '''
  if summary_type == "Abstractive":
    return facebook_model()
  return extractive_model()
  
  
# Streamlit App -- page copy and input widgets

st.title("Article Summarizer")

summary_type = st.sidebar.selectbox("Summary type", options=["Abstractive", "Extractive"])

st.markdown(
    "This application aims to make an extractive summary of newspaper articles from the text of the article or the url link of the article. The summary is based on a BERT model.")

st.markdown("""An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text. """)

st.markdown("""An abstract summary is a summary which captures the essential ideas of the text. An abstract summary may contain sentences which are not present in the original text.""")

st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")

st.markdown(
    "As input we only ingest raw text entered in the text box, or the URL of an article to be summarised."
)


st.markdown("---")

url_text = st.text_input("Please Enter a url here")

st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)

plain_text = st.text_input("Please Paste/Enter plain text here")

# Truthy only when the url field holds a syntactically valid URL.
is_url = validators.url(url_text)

clean_text = ""
if is_url:
  # Extract the article body as soon as a valid url is entered.
  clean_text = article_text_extractor(url=url_text)

summarize = st.button("Summarize")

if summarize:
  text_to_summarize = clean_text if is_url else plain_text

  if not text_to_summarize:
    # Guard: originally an empty input was passed straight to the model.
    st.warning("Please enter a url or some text to summarize.")
  else:
    with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."):
      # Bind to a new name -- the original `model = model()` shadowed the
      # model() factory function.
      summarizer_model = model()
      # Length bounds scale with the input but are capped. They must be
      # ints: the original used "/" and passed floats to the pipeline.
      min_ = min(80, len(text_to_summarize) // 5)
      max_ = min(400, len(text_to_summarize) // 2)
      if summary_type == "Extractive":
        summarized_text = ''.join(summarizer_model(text_to_summarize, min_length=min_, max_length=max_, num_sentences=3))
      else:
        summarized_text = summarizer_model(text_to_summarize, min_length=min_, max_length=max_)[0]['summary_text']

    st.subheader("Original text")
    st.write(text_to_summarize)

    st.subheader("Simple Summary")
    # Naive baseline: the first three non-empty paragraphs.
    st.write("\n".join(list(filter(None, text_to_summarize.split("\n")))[0:3]))

    st.subheader("Summarized text with NLP")
    st.write(summarized_text)