Spaces:
Runtime error
Runtime error
from summarizer import Summarizer | |
from goose3 import Goose | |
from fake_useragent import UserAgent | |
from bs4 import BeautifulSoup | |
from transformers import pipeline | |
import re | |
#from newsplease import NewsPlease | |
import validators | |
import streamlit as st | |
import requests | |
import warnings | |
warnings.filterwarnings("ignore") | |
# Shared Goose extractor instance, reused for every article download.
g = Goose()


def article_text_extractor(url: str) -> str:
    """Extract the article body from ``url`` and trim Reuters boilerplate.

    The page is fetched and parsed with the module-level Goose instance.
    Two Reuters-specific clean-ups are applied:

    * when ``"reuters"`` appears in the URL, the last non-empty line of the
      extracted text is dropped;
    * when the first non-empty line contains ``"REUTERS"``, that line is
      dropped as well.

    Args:
        url: Address of the article to download.

    Returns:
        The cleaned article text. If nothing could be extracted, the (empty)
        extracted text is returned unchanged instead of raising.
    """
    paper = g.extract(url=url)
    cleaned = paper.cleaned_text or ""

    # Non-empty lines only; blank lines carry no content.
    paragraphs = [line for line in cleaned.split("\n") if line]
    if not paragraphs:
        # Guard: indexing the first paragraph of an empty extraction used to
        # raise IndexError.
        return cleaned

    first_sentence = paragraphs[0]

    if "reuters" in url:
        text = "\n".join(paragraphs[:-1])
    else:
        text = cleaned

    if "REUTERS" in first_sentence:
        remaining = [line for line in text.split("\n") if line]
        text = "\n".join(remaining[1:])

    return text
def preprocess_text(x):
    """Clean raw text before summarisation.

    Characters that cannot be encoded as ASCII are dropped first. Then each
    of the following is replaced, in order, by a single space: link-like
    tokens, ``@`` mentions, ``#`` hashtags, runs of two or more whitespace
    characters, and runs of any character other than letters, digits and
    ``. , ! ' ?``.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text.
    """
    # Drop every character that has no ASCII representation.
    cleaned = x.encode("ascii", "ignore").decode()

    # Applied strictly in this order; each match becomes one space.
    patterns = (
        r"https*\S+",            # url
        r"@\S+",                 # mentions
        r"#\S+",                 # hashtags
        r"\s{2,}",               # repeated whitespace
        "[^.,!'?A-Za-z0-9]+",    # special characters except . , ! ' ?
    )
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned
def extractive_model():
    """Build and return a ``Summarizer`` using 'distilbert-base-uncased'."""
    return Summarizer('distilbert-base-uncased')
def facebook_model():
    """Build and return a 'summarization' pipeline using 'facebook/bart-large-cnn'."""
    return pipeline('summarization', model='facebook/bart-large-cnn')
def model():
    """Return the summarizer matching the module-level ``summary_type``.

    "Abstractive" selects ``facebook_model()``; any other value selects
    ``extractive_model()``.
    """
    builder = facebook_model if summary_type == "Abstractive" else extractive_model
    return builder()
# Streamlit App
#
# Page layout: title, summary-type selector in the sidebar, explanatory text,
# then two alternative inputs (article URL or pasted text) and a button that
# triggers summarisation.
st.title("Article Summarizer")
summary_type = st.sidebar.selectbox("Summary type", options=["Abstractive", "Extractive"])
st.markdown(
    "This application aims to make an extractive summary of newspaper articles from the text of the article or the url link of the article. The summary is based on a BERT model.")
st.markdown("""An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text. """)
st.markdown("""An abstract summary is a summary which captures the essential ideas of the text. An abstract summary may contain sentences which are not present in the original text.""")
st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")
st.markdown(
    "As input we only ingests Raw text entered in text box or URL of an article to be summarised."
)
st.markdown("---")
url_text = st.text_input("Please Enter a url here")
st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)
plain_text = st.text_input("Please Paste/Enter plain text here")

# A valid URL takes precedence over the pasted text.
is_url = validators.url(url_text)
summarize = st.button("Summarize")

if summarize:
    # Fetch the article only when a summary is requested, so the page is not
    # downloaded again on every unrelated widget interaction.
    text_to_summarize = article_text_extractor(url=url_text) if is_url else plain_text

    if not text_to_summarize or not text_to_summarize.strip():
        # Nothing to summarise: neither a usable article nor pasted text.
        st.warning("Please enter a valid article URL or some text to summarize.")
    else:
        with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."):
            # Kept under a separate name so the `model` factory is not rebound.
            summarizer = model()
            # Length bounds must be integers; true division produced floats
            # for short inputs.
            min_ = min(80, len(text_to_summarize) // 5)
            max_ = min(400, len(text_to_summarize) // 2)
            if summary_type == "Extractive":
                summarized_text = ''.join(
                    summarizer(text_to_summarize, min_length=min_, max_length=max_, num_sentences=3)
                )
            else:
                # truncation=True keeps over-long articles within the model's
                # input window instead of failing.
                summarized_text = summarizer(
                    text_to_summarize, min_length=min_, max_length=max_, truncation=True
                )[0]['summary_text']

        st.subheader("Original text")
        st.write(text_to_summarize)
        st.subheader("Simple Summary")
        # Baseline "summary": the first three non-empty lines of the input.
        leading_lines = [line for line in text_to_summarize.split("\n") if line][0:3]
        st.write("\n".join(leading_lines))
        st.subheader("Summarized text with NLP")
        st.write(summarized_text)