Spaces:

yassTrad
/

extractiveSum

Runtime error

App Files Files Community

extractiveSum / app.py

yassTrad

Update app.py

06a5293 about 2 years ago

raw history blame contribute delete

No virus

3.9 kB

	from summarizer import Summarizer
	from goose3 import Goose
	from fake_useragent import UserAgent
	from bs4 import BeautifulSoup
	from transformers import pipeline
	import re
	#from newsplease import NewsPlease
	import validators
	import streamlit as st
	import requests
	import warnings
	warnings.filterwarnings("ignore")

	g = Goose()

	def article_text_extractor(url: str):
	    """Extract the cleaned article text from ``url``.

	    The page is fetched and parsed with the module-level Goose instance ``g``.
	    Two adjustments are applied to the extracted text:

	    * if ``url`` contains "reuters", the last non-empty line is dropped;
	    * if the first non-empty line contains "REUTERS", that line is dropped
	      (blank lines are removed as a side effect of either adjustment).

	    NOTE(review): these look like they strip Reuters byline/footer
	    boilerplate -- confirm against real pages.

	    Args:
	        url: Address of the article to extract.

	    Returns:
	        The article text, or an empty string when nothing could be extracted.
	    """
	    paper = g.extract(url=url)
	    cleaned = paper.cleaned_text or ""
	    lines = [line for line in cleaned.split("\n") if line]

	    # Nothing extracted: return early instead of failing on lines[0].
	    if not lines:
	        return ""

	    drop_last = "reuters" in url
	    drop_first = "REUTERS" in lines[0]

	    # No adjustment needed: keep the text exactly as Goose produced it.
	    if not (drop_last or drop_first):
	        return cleaned

	    if drop_last:
	        lines = lines[:-1]
	    if drop_first:
	        lines = lines[1:]
	    return "\n".join(lines)


	def preprocess_text(x):
	    """Normalise raw text before summarisation.

	    Steps, in order:

	    1. drop every non-ASCII character;
	    2. replace ``http://`` / ``https://`` URLs with a space;
	    3. replace ``@mentions`` and ``#hashtags`` with a space;
	    4. replace every run of characters other than letters, digits and
	       ``. , ! ' ?`` (this includes whitespace runs) with a single space.

	    Args:
	        x: Text to clean.

	    Returns:
	        The cleaned text. Leading/trailing spaces are not stripped.
	    """
	    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
	    # Require the scheme separator so ordinary words that merely start with
	    # "http" (e.g. "httpd") are not mistaken for URLs.
	    x = re.sub(r"https?://\S+", " ", x)  # urls
	    x = re.sub(r"@\S+", " ", x)  # mentions
	    x = re.sub(r"#\S+", " ", x)  # hashtags
	    # Whitespace is outside the allowed set, so this single pass also
	    # collapses repeated spaces/newlines into one space.
	    x = re.sub(r"[^.,!'?A-Za-z0-9]+", " ", x)  # special characters except .,!'?

	    return x

	@st.cache(allow_output_mutation=True)
	def extractive_model():
	    """Build the extractive summarizer backed by 'distilbert-base-uncased'.

	    Decorated with ``st.cache(allow_output_mutation=True)``; presumably this
	    keeps the loaded model around between Streamlit reruns.
	    """
	    return Summarizer('distilbert-base-uncased')

	@st.cache(allow_output_mutation=True)
	def facebook_model():
	    """Build the abstractive 'summarization' pipeline using 'facebook/bart-large-cnn'.

	    Decorated with ``st.cache(allow_output_mutation=True)``; presumably this
	    keeps the loaded pipeline around between Streamlit reruns.
	    """
	    return pipeline('summarization', model='facebook/bart-large-cnn')

	@st.cache(allow_output_mutation=True)
	def model():
	    """Return the summarizer matching the module-level ``summary_type``.

	    "Abstractive" selects ``facebook_model()``; any other value selects
	    ``extractive_model()``.
	    """
	    load = facebook_model if summary_type == "Abstractive" else extractive_model
	    return load()


	# Streamlit app: page title, sidebar option, explanatory text and the two inputs.

	st.title("Article Summarizer")

	summary_type = st.sidebar.selectbox(
	    "Summary type",
	    options=["Abstractive", "Extractive"],
	)

	# Introductory paragraphs, rendered in order and closed by a horizontal rule.
	for _paragraph in (
	    "This application aims to make an extractive summary of newspaper articles from the text of the article or the url link of the article. The summary is based on a BERT model.",
	    """An extractive summary is one which extracts the most informative sentences from the article. It will therefore only consist of sentences present in the original text. """,
	    """An abstract summary is a summary which captures the essential ideas of the text. An abstract summary may contain sentences which are not present in the original text.""",
	    """Please do note that the model will take longer to generate summaries for documents that are too long.""",
	    "As input we only ingests Raw text entered in text box or URL of an article to be summarised.",
	    "---",
	):
	    st.markdown(_paragraph)

	# Input option 1: a link to the article.
	url_text = st.text_input("Please Enter a url here")

	st.markdown("<h3 style='text-align: center; color: red;'>OR</h3>", unsafe_allow_html=True)

	# Input option 2: the article text itself.
	plain_text = st.text_input("Please Paste/Enter plain text here")

	# Falsy when url_text is not a valid URL; in that case plain_text is used.
	is_url = validators.url(url_text)

	if is_url:
	    # Full article text extracted from the page behind the URL.
	    clean_text = article_text_extractor(url=url_text)

	summarize = st.button("Summarize")

	if summarize:
	    # NOTE: the text is summarised as-is; preprocess_text() is not applied here.
	    text_to_summarize = clean_text if is_url else plain_text

	    if not text_to_summarize or not text_to_summarize.strip():
	        # Nothing to work on: tell the user instead of running the model on "".
	        st.warning("Please enter a valid URL or some text to summarize.")
	    else:
	        with st.spinner(text="Loading Model and creating summary. This might take a few seconds depending on the length of your text..."):
	            # Use a distinct name so the model() loader function is not rebound.
	            summarizer = model()

	            # Length bounds scale with the input size (in characters) and are
	            # capped. Integer division keeps them ints, which is what the
	            # min_length/max_length parameters expect.
	            text_length = len(text_to_summarize)
	            min_ = min(80, text_length // 5)
	            max_ = max(1, min(400, text_length // 2))  # never 0, even for tiny inputs

	            if summary_type == "Extractive":
	                summarized_text = ''.join(
	                    summarizer(text_to_summarize, min_length=min_, max_length=max_, num_sentences=3)
	                )
	            else:
	                summarized_text = summarizer(
	                    text_to_summarize, min_length=min_, max_length=max_
	                )[0]['summary_text']

	        st.subheader("Original text")
	        st.write(text_to_summarize)

	        # Baseline summary: the first three non-empty lines of the input.
	        st.subheader("Simple Summary")
	        st.write("\n".join([line for line in text_to_summarize.split("\n") if line][0:3]))

	        st.subheader("Summarized text with NLP")
	        st.write(summarized_text)