extractiveSum / app.py
yassTrad's picture
Update app.py
06a5293
from summarizer import Summarizer
from goose3 import Goose
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from transformers import pipeline
import re
#from newsplease import NewsPlease
import validators
import streamlit as st
import requests
import warnings
# Suppress every warning category globally so library warnings are not emitted.
warnings.filterwarnings("ignore")
# Module-level Goose extractor instance; article_text_extractor() calls g.extract().
g = Goose()
def article_text_extractor(url: str):
    """Extract the article body text from ``url``.

    The page is fetched and parsed with the module-level Goose instance ``g``.
    Two Reuters-specific clean-ups are then applied to the extracted text:

    * if ``url`` contains "reuters", the last non-empty line is dropped;
    * if the first non-empty line contains "REUTERS", that line is dropped.

    Args:
        url: Address of the article to download and parse.

    Returns:
        The cleaned article text, or an empty string when no text could be
        extracted from the page.
    """
    paper = g.extract(url=url)
    cleaned = paper.cleaned_text
    # Non-empty lines of the extracted text, computed once and reused below.
    paragraphs = [line for line in cleaned.split("\n") if line]
    if not paragraphs:
        # Nothing was extracted; indexing paragraphs[0] would raise IndexError.
        return ""

    first_sentence = paragraphs[0]
    text = cleaned
    if "reuters" in url:
        text = "\n".join(paragraphs[:-1])
    if "REUTERS" in first_sentence:
        remaining = [line for line in text.split("\n") if line]
        text = "\n".join(remaining[1:])
    return text
def preprocess_text(x):
    """Normalise raw text before summarisation.

    Non-ASCII characters are dropped first; then each of the patterns below is
    replaced, in order, by a single space.

    Args:
        x: The text to clean.

    Returns:
        The cleaned text.
    """
    # Drop every character that cannot be encoded as ASCII.
    cleaned = x.encode("ascii", "ignore").decode()
    patterns_in_order = (
        r"https*\S+",            # URLs
        r"@\S+",                 # mentions
        r"#\S+",                 # hashtags
        r"\s{2,}",               # runs of whitespace
        "[^.,!'?A-Za-z0-9]+",    # anything except letters, digits and .,!'?
    )
    for pattern in patterns_in_order:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned
@st.cache(allow_output_mutation=True)
def extractive_model():
    """Create the extractive summarizer built on 'distilbert-base-uncased'.

    Wrapped in ``st.cache`` so the constructed object can be reused by
    Streamlit instead of being rebuilt on each call.

    Returns:
        A ``Summarizer`` instance.
    """
    return Summarizer('distilbert-base-uncased')
@st.cache(allow_output_mutation=True)
def facebook_model():
    """Create the abstractive summarization pipeline ('facebook/bart-large-cnn').

    Wrapped in ``st.cache`` so the constructed pipeline can be reused by
    Streamlit instead of being rebuilt on each call.

    Returns:
        A transformers 'summarization' pipeline.
    """
    return pipeline('summarization', model='facebook/bart-large-cnn')
@st.cache(allow_output_mutation=True)
def model():
    """Return the summarizer that matches the current sidebar selection.

    Reads the module-level ``summary_type``: "Abstractive" selects
    ``facebook_model()``; any other value selects ``extractive_model()``.
    """
    chosen_loader = facebook_model if summary_type == "Abstractive" else extractive_model
    return chosen_loader()
# Streamlit App
# Page layout: title, sidebar selector, explanatory text, two inputs (URL or
# plain text) and a "Summarize" button that triggers the summarisation.
st.title("Article Summarizer")
summary_type = st.sidebar.selectbox("Summary type", options=["Abstractive", "Extractive"])
st.markdown(
    "This application aims to make an extractive summary of newspaper articles from the text of the article or the url link of the article. The summary is based on a BERT model.")
st.markdown("""An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text. """)
st.markdown("""An abstract summary is a summary which captures the essential ideas of the text. An abstract summary may contain sentences which are not present in the original text.""")
st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")
st.markdown(
    "As input we only ingests Raw text entered in text box or URL of an article to be summarised."
)
st.markdown("---")
url_text = st.text_input("Please Enter a url here")
st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)
plain_text = st.text_input("Please Paste/Enter plain text here")
# validators.url returns a falsy value for anything that is not a valid URL.
is_url = validators.url(url_text)
summarize = st.button("Summarize")
if summarize:
    # Fetch the article only once the user asks for a summary, so the page is
    # not downloaded again on every unrelated rerun of the script.
    # A valid URL takes precedence over the plain-text box.
    text_to_summarize = article_text_extractor(url=url_text) if is_url else plain_text
    if not text_to_summarize or not text_to_summarize.strip():
        # Nothing to summarise: tell the user instead of calling the model on
        # an empty string.
        st.warning("Please enter a valid URL or some text to summarize.")
    else:
        with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."):
            # Bound to a new name so the loader function `model` is not
            # overwritten by its own return value.
            summarizer = model()
            # Length bounds scale with the input size. Floor division keeps
            # them integers; `/` would produce floats for short inputs.
            min_ = min(80, len(text_to_summarize) // 5)
            max_ = min(400, len(text_to_summarize) // 2)
            if summary_type == "Extractive":
                summarized_text = ''.join(
                    summarizer(text_to_summarize, min_length=min_, max_length=max_, num_sentences=3)
                )
            else:
                summarized_text = summarizer(
                    text_to_summarize, min_length=min_, max_length=max_
                )[0]['summary_text']
        st.subheader("Original text")
        st.write(text_to_summarize)
        # Baseline summary: the first three non-empty lines of the input.
        st.subheader("Simple Summary")
        leading_lines = [line for line in text_to_summarize.split("\n") if line][:3]
        st.write("\n".join(leading_lines))
        st.subheader("Summarized text with NLP")
        st.write(summarized_text)