sdhanabal1's picture
Add extractive summary information using LSA
8d4dd5e
raw
history blame
3.49 kB
from textwrap import wrap
from transformers import pipeline
import streamlit as st
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
import nltk
# Fetch the NLTK 'punkt' data package every time the script is executed.
# NOTE(review): presumably required by the sumy Tokenizer used further down - confirm.
nltk.download('punkt')
# Language name shared by the stemmer, the stop-word list and the tokenizer.
DEFAULT_LANGUAGE = "english"
# Initial value for the "number of sentences" session-state entry.
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
# Module-level LSA summarizer built from a stemmer and the stop words of DEFAULT_LANGUAGE.
stemmer = Stemmer(DEFAULT_LANGUAGE)
lsa_summarizer = LsaSummarizer(stemmer)
lsa_summarizer.stop_words = get_stop_words(language=DEFAULT_LANGUAGE)
# Page title followed by a short description of the two-stage summarization.
st.markdown('# Terms & conditions abstractive summarization model :pencil:')
st.write('This app provides the abstract summary of the provided terms & conditions. '
'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization')
# Link to the model card of the checkpoint loaded by load_model().
st.write('Information about the model :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
# Usage instructions rendered as a markdown bullet list.
st.markdown("""
To use this:
- Number of sentences to be extracted is configurable
- Copy terms & conditions and hit 'Summarize'
""")
@st.cache(
    allow_output_mutation=True,
    suppress_st_warning=True,
    show_spinner=False,
)
def load_model():
    """Create the terms & conditions summarization pipeline.

    The function is wrapped in ``st.cache`` with Streamlit's own cache
    spinner disabled; a custom spinner message is displayed instead while
    the pipeline is being built.

    Returns:
        The ``transformers`` summarization pipeline that uses the
        ``ml6team/distilbart-tos-summarizer-tosdr`` model and tokenizer.
    """
    # The same checkpoint id provides both the model and its tokenizer.
    checkpoint = 'ml6team/distilbart-tos-summarizer-tosdr'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(
            task='summarization',
            model=checkpoint,
            tokenizer=checkpoint,
        )
# Summarization pipeline used by generate_abstractive_summary().
tc_pipeline = load_model()

# Seed the session state with defaults for entries that are not present yet.
for _state_key, _state_default in (
    ('tc_text', ""),
    ('sentences_length', DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

st.header("Input")

# Both inputs live in one form, so their values are sent together when
# the 'Summarize' button is pressed.
with st.form(key='terms-and-conditions'):
    sentences_length_input = st.number_input(
        label='Number of sentences to be extracted:',
        value=st.session_state['sentences_length'],
        min_value=1,
    )
    tc_text_input = st.text_area(
        label='Terms & conditions text:',
        value=st.session_state['tc_text'],
        height=240,
    )
    submit_button = st.form_submit_button(label='Summarize')

st.header("Output")
def generate_abstractive_summary(summary) -> str:
    """Condense text with the abstractive summarization pipeline.

    The text is split with ``textwrap.wrap`` into chunks of at most 2048
    characters, the chunks are passed to ``tc_pipeline`` and the
    ``summary_text`` of every result is joined with single spaces.

    Args:
        summary: Text to condense; in this app, the extractive summary.

    Returns:
        The joined chunk summaries, or an empty string when ``summary`` is
        empty or contains only whitespace.
    """
    chunks = wrap(summary, 2048)
    if not chunks:
        # ``wrap`` produces no chunks for blank text; skip the model call
        # instead of handing the pipeline an empty batch.
        return ""
    return " ".join(result['summary_text'] for result in tc_pipeline(chunks))
def generate_extractive_summary(text, sentences_count: int) -> str:
    """Select sentences from ``text`` with the module-level LSA summarizer.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with single spaces, or an empty
        string when ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to extract from; avoid tokenizing blank input.
        return ""
    parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
    selected_sentences = lsa_summarizer(parser.document, sentences_count)
    # Convert with str() instead of reading the private ``_text`` attribute.
    return " ".join(str(sentence) for sentence in selected_sentences)
def display_abstractive_summary(summary) -> None:
    """Render the abstractive summary section of the page.

    Writes an "Abstractive Summary" subheader, an empty level-5 markdown
    heading and a text area with an empty label that holds ``summary``.

    Args:
        summary: Text placed in the text area.
    """
    st.subheader("Abstractive Summary")
    # Empty heading; presumably used as vertical spacing above the text area.
    st.markdown('#####')
    st.text_area(label='', value=summary, height=240)
def display_extractive_summary(summary) -> None:
    """Render the extractive summary section of the page.

    Writes an "Extractive Summary" subheader, an empty level-5 markdown
    heading and a text area with an empty label that holds ``summary``.

    Args:
        summary: Text placed in the text area.
    """
    st.subheader("Extractive Summary")
    # Empty heading; presumably used as vertical spacing above the text area.
    st.markdown('#####')
    st.text_area(label='', value=summary, height=240)
# Run the two-stage summarization only on the run in which the form was submitted.
if submit_button:
    tc_text = tc_text_input
    sentences_length = sentences_length_input

    if not tc_text.strip():
        # Blank input would produce two empty summaries, and rendering two
        # identical text areas without keys makes Streamlit raise a
        # duplicate-widget error, so ask for input instead.
        st.warning('Please provide the terms & conditions text to summarize.')
    else:
        # Stage 1: LSA picks the requested number of sentences.
        extract_summary = generate_extractive_summary(tc_text, sentences_length)
        # Stage 2: the model rewrites the extracted sentences.
        abstract_summary = generate_abstractive_summary(extract_summary)

        display_extractive_summary(extract_summary)
        display_abstractive_summary(abstract_summary)