Spaces:
Runtime error
Runtime error
File size: 3,902 Bytes
3dabb3d 3058374 5abd3db 7f4f300 719949c 9433e8d 3058374 3dabb3d 5abd3db 3dabb3d 3058374 a8c3652 dd5ac98 be2791c e029660 dd5ac98 f9fc694 9433e8d b39512e 9433e8d 3dabb3d f9fc694 387be3c 3dabb3d f9fc694 3dabb3d 75c9590 3dabb3d f9fc694 3dabb3d 04271f2 3dabb3d c4031d7 3dabb3d dc7e03d da063ef dc7e03d 3dabb3d 75c9590 3dabb3d c4031d7 387be3c 2c44dbc 3dabb3d 213a0bb 6b6d74f 06a5293 3dabb3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
from summarizer import Summarizer
from goose3 import Goose
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from transformers import pipeline
import re
#from newsplease import NewsPlease
import validators
import streamlit as st
import requests
import warnings
# Ignore every warning raised through the ``warnings`` module from here on.
warnings.filterwarnings("ignore")
# Module-level Goose extractor; article_text_extractor() calls g.extract().
g = Goose()
def article_text_extractor(url: str):
    """Extract the main article text from ``url``.

    The page is parsed with the module-level Goose instance ``g``. Two
    Reuters-specific clean-ups are then applied to the extracted text:

    * when ``url`` contains ``"reuters"``, the last non-empty line is dropped;
    * when the first non-empty line contains ``"REUTERS"``, the first
      non-empty line of the remaining text is dropped.

    Args:
        url: Address of the article to extract.

    Returns:
        The extracted text, or an empty string when the extractor produced
        no text at all (previously this case raised ``IndexError``).
    """
    paper = g.extract(url=url)
    raw_text = paper.cleaned_text or ""
    paragraphs = [line for line in raw_text.split("\n") if line]
    if not paragraphs:
        # Nothing was extracted; there is no first line to inspect.
        return ""

    text = raw_text
    if "reuters" in url:
        text = "\n".join(paragraphs[:-1])
    if "REUTERS" in paragraphs[0]:
        remaining = [line for line in text.split("\n") if line]
        text = "\n".join(remaining[1:])
    return text
def preprocess_text(x):
    """Return ``x`` with non-ASCII characters, URLs, mentions and hashtags removed.

    The text is first reduced to ASCII (other characters are dropped), then
    each pattern below is replaced by a single space, in order.
    """
    patterns = (
        r"https*\S+",  # URLs
        r"@\S+",  # mentions
        r"#\S+",  # hashtags
        r"\s{2,}",  # runs of whitespace
        "[^.,!'?A-Za-z0-9]+",  # anything but letters, digits and . , ! ' ?
    )
    cleaned = x.encode("ascii", "ignore").decode()
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned
@st.cache(allow_output_mutation=True)
def extractive_model():
    """Build the extractive summarizer on top of ``distilbert-base-uncased``.

    The ``st.cache`` decorator lets Streamlit reuse the returned object
    instead of constructing it again on every call.
    """
    return Summarizer('distilbert-base-uncased')
@st.cache(allow_output_mutation=True)
def facebook_model():
    """Build the abstractive summarization pipeline (``facebook/bart-large-cnn``).

    The ``st.cache`` decorator lets Streamlit reuse the returned pipeline
    instead of constructing it again on every call.
    """
    return pipeline('summarization', model='facebook/bart-large-cnn')
def model():
    """Return the summarizer that matches the current sidebar selection.

    Reads the module-level ``summary_type``: ``"Abstractive"`` selects
    ``facebook_model()``, anything else selects ``extractive_model()``.

    This dispatcher is intentionally not wrapped in ``st.cache``. It takes no
    arguments, so a cache keyed on call arguments cannot distinguish the two
    selections and may keep returning whichever model was requested first.
    The two loader functions are cached themselves, so dispatching here does
    not reload anything.
    """
    if summary_type == "Abstractive":
        return facebook_model()
    return extractive_model()
# Streamlit App
st.title("Article Summarizer")
summary_type = st.sidebar.selectbox("Summary type", options=["Abstractive", "Extractive"])

st.markdown(
    "This application aims to make an extractive summary of newspaper articles from the text of the article or the url link of the article. The summary is based on a BERT model.")
st.markdown("""An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text. """)
st.markdown("""An abstract summary is a summary which captures the essential ideas of the text. An abstract summary may contain sentences which are not present in the original text.""")
st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")
st.markdown(
    "As input we only ingests Raw text entered in text box or URL of an article to be summarised."
)
st.markdown("---")

# Two alternative inputs: a URL to fetch, or text pasted directly.
url_text = st.text_input("Please Enter a url here")
st.markdown(
    "<h3 style='text-align: center; color: red;'>OR</h3>",
    unsafe_allow_html=True,
)
plain_text = st.text_input("Please Paste/Enter plain text here")

# A valid URL takes precedence over the pasted text.
is_url = validators.url(url_text)
if is_url:
    # complete text
    clean_text = article_text_extractor(url=url_text)

summarize = st.button("Summarize")
if summarize:
    # The raw text is summarized as-is; preprocess_text() is not applied.
    text_to_summarize = clean_text if is_url else plain_text

    if not text_to_summarize.strip():
        # Nothing to summarize: avoid calling the models with empty input.
        st.warning("Please enter a valid url or some text to summarize.")
    else:
        with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."):
            # Keep the loaded model under its own name so the model() function
            # is not shadowed.
            summarizer = model()
            min_ = min(80, len(text_to_summarize) / 5)
            max_ = min(400, len(text_to_summarize) / 2)
            if summary_type == "Extractive":
                summarized_text = ''.join(
                    summarizer(text_to_summarize, min_length=min_, max_length=max_, num_sentences=3)
                )
            else:
                # The generation lengths must be integers, and inputs longer
                # than the model's limit are truncated instead of failing.
                summarized_text = summarizer(
                    text_to_summarize,
                    min_length=int(min_),
                    max_length=int(max_),
                    truncation=True,
                )[0]['summary_text']

        st.subheader("Original text")
        st.write(text_to_summarize)
        # Naive baseline: the first three non-empty lines of the input.
        st.subheader("Simple Summary")
        st.write("\n".join(list(filter(None, text_to_summarize.split("\n")))[0:3]))
        st.subheader("Summarized text with NLP")
        st.write(summarized_text)