Spaces:
Build error
Build error
from textwrap import wrap | |
from transformers import pipeline | |
import streamlit as st | |
from sumy.parsers.plaintext import PlaintextParser | |
from sumy.nlp.tokenizers import Tokenizer | |
from sumy.nlp.stemmers import Stemmer | |
from sumy.summarizers.lsa import LsaSummarizer | |
from sumy.utils import get_stop_words | |
import nltk | |
# Fetch the NLTK 'punkt' data at start-up (presumably required by the sumy
# Tokenizer used for sentence splitting -- confirm against sumy's docs).
nltk.download('punkt')

# Language used for the stemmer, the stop-word list and the tokenizer.
DEFAULT_LANGUAGE = "english"
# Initial value of the "number of sentences to be extracted" input.
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10

# Module-level LSA (Latent Semantic Analysis) extractive summarizer,
# configured once with a stemmer and stop words for DEFAULT_LANGUAGE.
stemmer = Stemmer(DEFAULT_LANGUAGE)
lsa_summarizer = LsaSummarizer(stemmer)
lsa_summarizer.stop_words = get_stop_words(DEFAULT_LANGUAGE)
# Page title.
st.markdown('# Terms & conditions abstractive summarization model :pencil:')

# One-paragraph description of what the app does, followed by a model link.
_APP_DESCRIPTION = (
    'This app provides the abstract summary of the provided terms & conditions. '
    'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization'
)
st.write(_APP_DESCRIPTION)
st.write('Information about the model :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')

# Short usage instructions rendered as a markdown bullet list.
_USAGE_NOTES = """
To use this:
- Number of sentences to be extracted is configurable
- Copy terms & conditions and hit 'Summarize'
"""
st.markdown(_USAGE_NOTES)
def _build_summarization_pipeline():
    """Instantiate the terms & conditions summarization pipeline (expensive)."""
    model_id = 'ml6team/distilbart-tos-summarizer-tosdr'
    return pipeline(task='summarization', model=model_id, tokenizer=model_id)


def _cache_across_reruns(func):
    """Wrap *func* so its result is built once and reused on later reruns.

    Uses ``st.cache_resource`` when the installed Streamlit provides it and
    falls back to the legacy ``st.cache`` otherwise, so the app keeps working
    on both old and new Streamlit releases.
    """
    cache_resource = getattr(st, 'cache_resource', None)
    if cache_resource is not None:
        return cache_resource(show_spinner=False)(func)
    # allow_output_mutation: the pipeline object is not hashable/immutable.
    return st.cache(allow_output_mutation=True, show_spinner=False)(func)


_cached_summarization_pipeline = _cache_across_reruns(_build_summarization_pipeline)


def load_model():
    """Return the summarization pipeline, loading it at most once.

    Streamlit re-executes the whole script on every interaction; without
    caching the model would be re-instantiated on each rerun.  The spinner is
    kept outside the cached function so no Streamlit element is emitted from
    within cached code.

    Returns:
        The ``transformers`` summarization pipeline for
        ``ml6team/distilbart-tos-summarizer-tosdr``.
    """
    with st.spinner('Please wait for the model to load...'):
        return _cached_summarization_pipeline()
# Summarization pipeline used by generate_abstractive_summary().
tc_pipeline = load_model()

# Seed the session with default form values the first time the script runs;
# keys that already exist are left untouched.
for _state_key, _initial_value in (
    ('tc_text', ""),
    ('sentences_length', DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
st.header("Input")

# Both inputs live in one form, so the script only receives their values
# when the 'Summarize' button is pressed.
_input_form = st.form(key='terms-and-conditions')
with _input_form:
    # How many sentences the extractive (LSA) step should keep.
    sentences_length_input = st.number_input(
        label='Number of sentences to be extracted:',
        value=st.session_state['sentences_length'],
        min_value=1,
    )
    # Raw terms & conditions text pasted by the user.
    tc_text_input = st.text_area(
        label='Terms & conditions text:',
        value=st.session_state['tc_text'],
        height=240,
    )
    # True only on the run triggered by pressing the button.
    submit_button = st.form_submit_button(label='Summarize')

st.header("Output")
def generate_abstractive_summary(summary, *, chunk_width: int = 2048, summarizer=None) -> str:
    """Produce an abstractive summary of *summary*.

    The text is split with ``textwrap.wrap`` into chunks of at most
    ``chunk_width`` characters, each chunk is summarized, and the per-chunk
    summaries are joined with single spaces.

    Args:
        summary: Text to summarize (here: the extractive summary).
        chunk_width: Maximum number of characters per chunk passed to the
            summarizer. Defaults to 2048, the previously hard-coded value.
        summarizer: Callable that takes a list of strings and returns a list
            of dicts with a ``'summary_text'`` key. Defaults to the
            module-level ``tc_pipeline``.

    Returns:
        The joined summaries, or ``""`` when *summary* is empty or contains
        only whitespace.
    """
    chunks = wrap(summary, chunk_width)
    if not chunks:
        # wrap() yields no chunks for empty/whitespace-only text; do not call
        # the model with an empty batch.
        return ""
    if summarizer is None:
        summarizer = tc_pipeline
    return " ".join(result['summary_text'] for result in summarizer(chunks))
def generate_extractive_summary(text, sentences_count: int) -> str:
    """Return an extractive summary of *text* built by the LSA summarizer.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of sentences the summarizer is asked to keep.

    Returns:
        The selected sentences joined with single spaces, or ``""`` when
        *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to parse; skip tokenization and the summarizer entirely.
        return ""
    parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
    selected_sentences = lsa_summarizer(parser.document, sentences_count)
    # str(sentence) is the public way to get a sentence's text; avoid reading
    # the private ``_text`` attribute.
    return " ".join(str(sentence) for sentence in selected_sentences)
def display_abstractive_summary(summary) -> None:
    """Render the abstractive summary under its own sub-header.

    Args:
        summary: Text shown in the output text area.
    """
    st.subheader("Abstractive Summary")
    st.markdown('#####')
    # A distinct, non-empty label keeps this widget's auto-generated identity
    # different from any other text area showing the same text (an empty
    # label plus an equal value makes Streamlit report duplicate widgets),
    # and Streamlit discourages empty labels for accessibility.
    st.text_area(
        label='Abstractive summary',
        value=summary,
        height=240,
    )
def display_extractive_summary(summary) -> None:
    """Render the extractive summary under its own sub-header.

    Args:
        summary: Text shown in the output text area.
    """
    st.subheader("Extractive Summary")
    st.markdown('#####')
    # A distinct, non-empty label keeps this widget's auto-generated identity
    # different from any other text area showing the same text (an empty
    # label plus an equal value makes Streamlit report duplicate widgets),
    # and Streamlit discourages empty labels for accessibility.
    st.text_area(
        label='Extractive summary',
        value=summary,
        height=240,
    )
# Runs only on the script execution triggered by pressing 'Summarize'.
if submit_button:
    tc_text = tc_text_input
    sentences_length = sentences_length_input
    if not tc_text.strip():
        # Nothing to summarize: skip both summarization steps and tell the
        # user instead of running the models on blank input.
        st.warning('Please enter the terms & conditions text to summarize.')
    else:
        # Extractive step first; its output is the input of the abstractive model.
        extract_summary = generate_extractive_summary(tc_text, sentences_length)
        abstract_summary = generate_abstractive_summary(extract_summary)
        display_extractive_summary(extract_summary)
        display_abstractive_summary(abstract_summary)