from summarizer import Summarizer
from goose3 import Goose
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from transformers import pipeline
import re
#from newsplease import NewsPlease
import validators
import streamlit as st
import requests
import warnings

warnings.filterwarnings("ignore")

g = Goose()


def article_text_extractor(url: str):
    '''Extract the article text from a url, dropping Reuters boilerplate.'''
    paper = g.extract(url=url)
    first_sentence = list(filter(None, paper.cleaned_text.split("\n")))[0]
    # Reuters pages carry a trailing attribution line; drop it.
    text = paper.cleaned_text if "reuters" not in url else "\n".join(list(filter(None, paper.cleaned_text.split("\n")))[:-1])
    # Drop a leading "REUTERS" dateline if present.
    text = text if "REUTERS" not in first_sentence else "\n".join(list(filter(None, text.split("\n")))[1:])
    return text


def preprocess_text(x):
    x = x.encode("ascii", "ignore").decode()  # strip non-ascii characters
    x = re.sub(r"https*\S+", " ", x)  # urls
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = re.sub(r"\s{2,}", " ", x)  # extra whitespace
    x = re.sub("[^.,!'?A-Za-z0-9]+", " ", x)  # special characters except .,!'?
    return x


@st.cache(allow_output_mutation=True)
def extractive_model():
    model = Summarizer('distilbert-base-uncased')
    return model


@st.cache(allow_output_mutation=True)
def facebook_model():
    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    return summarizer


@st.cache(allow_output_mutation=True)
def model():
    # Select the summarization back-end based on the sidebar choice (summary_type is set below).
    if summary_type == "Abstractive":
        return facebook_model()
    else:
        return extractive_model()
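# As a rough, standalone illustration of the two back-ends wrapped above (not
# part of the app's control flow; the sample text is an assumption made purely
# for demonstration), they can be exercised directly like this:
#
#     sample = "First sentence of an article. Second sentence. Third sentence."
#     extractive = Summarizer('distilbert-base-uncased')
#     print(extractive(sample, num_sentences=2))
#     abstractive = pipeline('summarization', model='facebook/bart-large-cnn')
#     print(abstractive(sample, min_length=5, max_length=20)[0]['summary_text'])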

# Streamlit App
st.title("Article Summarizer")

summary_type = st.sidebar.selectbox("Summary type", options=["Abstractive", "Extractive"])

st.markdown(
    "This application makes an extractive or abstractive summary of newspaper articles "
    "from either the text of the article or the url link of the article. The extractive summary "
    "is based on a BERT model and the abstractive summary on a BART model.")
st.markdown("""An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text.""")
st.markdown("""An abstractive summary is a summary which captures the essential ideas of the text. An abstractive summary may contain sentences which are not present in the original text.""")
st.markdown("""Please note that the model will take longer to generate summaries for very long documents.""")
st.markdown("As input, the app only ingests raw text entered in the text box or the url of an article to be summarised.")
st.markdown("---")

url_text = st.text_input("Please enter a URL here")

st.markdown(
    "<h3 style='text-align: center;'>OR</h3>",
    unsafe_allow_html=True,
)

", unsafe_allow_html=True, ) plain_text = st.text_input("Please Paste/Enter plain text here") is_url = validators.url(url_text) if is_url: # complete text clean_text = article_text_extractor(url=url_text) summarize = st.button("Summarize") if summarize: #text_to_summarize = preprocess_text(clean_text) if is_url else preprocess_text(plain_text) text_to_summarize = clean_text if is_url else plain_text with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."): model = model() #summarized_text = text_to_summarize if len(text_to_summarize) > 60 else ''.join(model(body, min_length=60)) min_ = min(80,len(text_to_summarize)/5) max_ = min(400,len(text_to_summarize)/2) summarized_text = ''.join(model(text_to_summarize, min_length=min_,max_length=max_,num_sentences=3)) if summary_type == "Extractive" else model(text_to_summarize, min_length=min_,max_length=max_)[0]['summary_text'] st.subheader("Original text") st.write(text_to_summarize) st.subheader("Simple Summary") st.write("\n".join(list(filter(None, text_to_summarize.split("\n")))[0:3])) st.subheader("Summarized text with NLP") st.write(summarized_text)