Spaces:

ml6team
/

distilbart-tos-summarizer-tosdr

Build error

App Files Files Community

distilbart-tos-summarizer-tosdr / app.py

sdhanabal1

Test html rendering error

3738b3d almost 3 years ago

raw

history blame

5.96 kB

	import html
	import os
	from typing import AnyStr

	import nltk
	import streamlit as st
	import validators
	from transformers import pipeline
	from validators import ValidationFailure

	from Summarizer import Summarizer


	def main() -> None:
	    """Entry point of the Streamlit app.

	    Downloads the NLTK 'punkt' resource and renders the page title followed
	    by three introductory paragraphs. The paragraphs contain inline HTML
	    (``<br>``, ``<b>``) and are therefore rendered with
	    ``unsafe_allow_html=True``.
	    """
	    # Same first step as before: make the 'punkt' resource available.
	    nltk.download('punkt')

	    page_title = '# Terms & Conditions Summarizer :pencil:'
	    pitch = (
	        'Do you also always take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up to an app like the responsible citizen that you are? :thinking_face:<br>'
	        'No?<br>'
	        "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!"
	    )
	    instructions = (
	        'Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
	        'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
	        'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:'
	    )
	    further_reading = (
	        '<b>Want to find out more?</b> :brain:<br>'
	        'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
	        'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr'
	    )

	    # The title is plain markdown; the paragraphs need raw HTML enabled.
	    st.markdown(page_title)
	    for html_paragraph in (pitch, instructions, further_reading):
	        st.markdown(html_paragraph, unsafe_allow_html=True)

	@st.cache(show_spinner=False,
	          suppress_st_warning=True,
	          allow_output_mutation=True)
	def create_pipeline():
	    """Create the summarization pipeline used by the app.

	    Model and tokenizer are both loaded from the
	    ``ml6team/distilbart-tos-summarizer-tosdr`` checkpoint. A spinner is
	    shown while loading; the result is memoized through ``st.cache``.

	    Returns:
	        The object returned by ``transformers.pipeline``.
	    """
	    checkpoint = 'ml6team/distilbart-tos-summarizer-tosdr'
	    with st.spinner('Please wait for the model to load...'):
	        return pipeline(
	            task='summarization',
	            model=checkpoint,
	            tokenizer=checkpoint,
	        )

	def display_abstractive_summary(summary_sentences: list) -> None:
	    """Render the abstractive summary as a bulleted list.

	    Args:
	        summary_sentences: Sentences to display, one bullet per sentence.
	    """
	    st.subheader("Abstractive Summary")
	    st.markdown('#####')
	    for sentence in summary_sentences:
	        # The bullets are rendered with raw HTML enabled, so the sentence
	        # text is escaped first (as the extractive view already does);
	        # otherwise markup contained in the summarized document could be
	        # injected into the page.
	        safe_sentence = html.escape(str(sentence))
	        st.markdown(f"- {safe_sentence}", unsafe_allow_html=True)

	def display_extractive_summary(terms_and_conditions_text: str, summary_sentences: list) -> None:
	    """Render the full text with the extracted sentences highlighted.

	    The text and every sentence are HTML-escaped, each sentence occurrence
	    is wrapped in a yellow-background ``<span>``, newlines become
	    ``<br/>`` and the result is written with raw HTML enabled.

	    Args:
	        terms_and_conditions_text: The complete text that was summarized.
	        summary_sentences: Sentences of that text to highlight.
	    """
	    st.subheader("Extractive Summary")
	    st.markdown('#####')
	    replaced_text = html.escape(terms_and_conditions_text)
	    already_highlighted = set()
	    for sentence in summary_sentences:
	        escaped_sentence = html.escape(sentence)
	        # A blank sentence would make str.replace() inject the markup
	        # between every character (or around every space) of the text.
	        if not escaped_sentence.strip():
	            continue
	        # str.replace() already handles every occurrence; replacing the same
	        # sentence again would wrap the highlighted copy a second time.
	        if escaped_sentence in already_highlighted:
	            continue
	        already_highlighted.add(escaped_sentence)
	        replaced_text = replaced_text.replace(escaped_sentence,
	                                              f"<p>"
	                                              f"<span style='background-color: yellow'>{escaped_sentence}</span>"
	                                              f"</p>")
	    # Newlines are converted last so sentences spanning lines still match.
	    replaced_text = replaced_text.replace('\n', '<br/>')
	    with st.container():
	        st.write(f"<p>{replaced_text}</p>", unsafe_allow_html=True)

	def is_valid_url(url: str) -> bool:
	    """Tell whether ``url`` is accepted by ``validators.url``.

	    Args:
	        url: Candidate string to check.

	    Returns:
	        ``False`` when the validator returns a ``ValidationFailure``,
	        ``True`` otherwise.
	    """
	    return not isinstance(validators.url(url), ValidationFailure)

	def list_all_filenames() -> list:
	    """List the sample documents found in ``./sample-terms-and-conditions/``.

	    Returns:
	        The names of all ``.txt`` files in that directory with the ``.txt``
	        suffix removed, sorted alphabetically.
	    """
	    suffix = '.txt'
	    # Slice off only the trailing suffix: str.replace() would also remove a
	    # '.txt' appearing in the middle of a name, yielding a name that no
	    # longer maps back to the file. os.listdir() returns entries in
	    # arbitrary order, so sort for a stable listing.
	    return sorted(
	        entry[:-len(suffix)]
	        for entry in os.listdir('./sample-terms-and-conditions/')
	        if entry.endswith(suffix)
	    )

	def fetch_file_contents(filename: str) -> str:
	    """Read one sample document from ``./sample-terms-and-conditions/``.

	    Args:
	        filename: Name of the sample without the ``.txt`` extension; it is
	            lower-cased before the file is looked up.

	    Returns:
	        The full text of the file.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    path = f'./sample-terms-and-conditions/{filename.lower()}.txt'
	    # Decode explicitly as UTF-8 instead of relying on the platform default,
	    # which differs between operating systems.
	    with open(path, 'r', encoding='utf-8') as f:
	        return f.read()

	# Build the summarizer around the cached Hugging Face pipeline.
	summarizer: Summarizer = Summarizer(create_pipeline())

	# Seed the per-session values once; later reruns keep what is stored.
	for state_key, default_value in (
	    ('tc_text', ''),
	    ('sentences_length', Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH),
	    ('sample_choice', ''),
	):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = default_value

	st.header("Input")

	sentences_length = st.number_input(
	    label='Number of sentences to be extracted:',
	    min_value=5,
	    max_value=15,
	    value=st.session_state.sentences_length
	)
	sample_choice = st.selectbox(
	    'Choose a sample terms & conditions:',
	    list_all_filenames())
	# st.selectbox returns None when there is nothing to select (no sample
	# files); fall back to an empty text instead of failing on None.lower().
	if sample_choice is None:
	    st.session_state.tc_text = ''
	else:
	    st.session_state.tc_text = fetch_file_contents(sample_choice)
	# The text area is pre-filled with the chosen sample; the user may replace
	# it with other text or with a URL.
	tc_text_input = st.text_area(
	    value=st.session_state.tc_text,
	    label='Terms & conditions content or specify an URL:',
	    height=240
	)

	summarize_button = st.button(label='Summarize')

	@st.cache(suppress_st_warning=True,
	          show_spinner=False,
	          allow_output_mutation=True,
	          # Each listed type gets a hash function that always returns None.
	          # NOTE(review): presumably so st.cache does not try to hash
	          # model/tokenizer internals reachable from `summarizer` - confirm.
	          hash_funcs={
	              type_name: (lambda _: None)
	              for type_name in ("torch.nn.parameter.Parameter",
	                                "tokenizers.Tokenizer",
	                                "tokenizers.AddedToken")
	          })
	def abstractive_summary_from_cache(summary_sentences: tuple) -> tuple:
	    """Produce the abstractive summary for the given sentences, memoized.

	    Args:
	        summary_sentences: Extracted sentences, passed as a tuple.

	    Returns:
	        The result of ``summarizer.abstractive_summary`` as a tuple.
	    """
	    sentence_list = list(summary_sentences)
	    with st.spinner('Summarizing the text is in progress...'):
	        abstract_sentences = tuple(summarizer.abstractive_summary(sentence_list))
	    return abstract_sentences

	if summarize_button:
	    # A valid URL is summarized via the URL entry point; anything else is
	    # treated as the raw text to summarize.
	    extract_sentences = (summarizer.extractive_summary_from_url
	                         if is_valid_url(tc_text_input)
	                         else summarizer.extractive_summary_from_text)
	    extract_summary_sentences = extract_sentences(tc_text_input, sentences_length)

	    # The cached helper takes and returns tuples; convert at the boundary.
	    abstract_summary_list = list(
	        abstractive_summary_from_cache(tuple(extract_summary_sentences))
	    )

	    display_abstractive_summary(abstract_summary_list)
	    display_extractive_summary(tc_text_input, extract_summary_sentences)


	if __name__ == "__main__":
	    # Run the app only when this file is executed as the main script,
	    # not when it is imported.
	    main()