Spaces:

ml6team
/

distilbart-tos-summarizer-tosdr

Build error

App Files Files Community

sdhanabal1 committed on Jan 27, 2022

Commit

abcaca9

•

1 Parent(s): 8d4dd5e

Refactor and highlight extract summary

Browse files

Files changed (4) hide show

.gitignore +1 -0
Summarizer.py +45 -0
app.py +46 -58
requirements.txt +2 -1

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 .idea/
 venv/

 .idea/
 venv/
+__pycache__/

Summarizer.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from textwrap import wrap
+from sumy.parsers import DocumentParser
+from sumy.parsers.html import HtmlParser
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.summarizers.lsa import LsaSummarizer
+from sumy.utils import get_stop_words
+from transformers import Pipeline
+class Summarizer:
+    """Two-stage summarizer: LSA sentence extraction, then abstractive rewriting.
+
+    Extraction is done with sumy's ``LsaSummarizer``; the abstractive step is
+    delegated to the ``transformers`` pipeline supplied by the caller.
+    """
+
+    DEFAULT_LANGUAGE = "english"
+    # Maximum width, in characters, of each chunk handed to the pipeline.
+    MAX_CHUNK_WIDTH = 2048
+
+    def __init__(self, pipeline: Pipeline):
+        """Keep *pipeline* and set up the LSA summarizer for DEFAULT_LANGUAGE."""
+        self.pipeline = pipeline
+        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
+        self.lsa_summarizer = LsaSummarizer(stemmer)
+        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)
+
+    @staticmethod
+    def sentence_list(summarized_sentences) -> list:
+        """Return the text of every sentence object in *summarized_sentences*."""
+        # NOTE(review): this reads the private ``_text`` attribute of each
+        # sentence object, exactly as before - confirm there is a public accessor.
+        return [sentence._text for sentence in summarized_sentences]
+
+    def __extractive_summary(self, parser: DocumentParser, sentences_count) -> tuple:
+        """Run the LSA summarizer over the parsed document.
+
+        Returns a pair ``(all_sentences, summary_sentences)``, both lists of
+        sentence texts: every sentence of the document, and the
+        *sentences_count* sentences selected by the summarizer.
+        """
+        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
+        summarized_list = Summarizer.sentence_list(summarized_sentences)
+        all_sentences_list = Summarizer.sentence_list(parser.document.sentences)
+        return all_sentences_list, summarized_list
+
+    def extractive_summary_from_text(self, text: str, sentences_count: int) -> tuple:
+        """Extract *sentences_count* sentences from plain *text*.
+
+        Returns ``(all_sentences, summary_sentences)`` as two lists of strings.
+        """
+        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
+        return self.__extractive_summary(parser, sentences_count)
+
+    def extractive_summary_from_url(self, url: str, sentences_count: int) -> tuple:
+        """Extract *sentences_count* sentences from the HTML document at *url*.
+
+        Returns ``(all_sentences, summary_sentences)`` as two lists of strings.
+        """
+        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
+        return self.__extractive_summary(parser, sentences_count)
+
+    def abstractive_summary(self, summary: str) -> str:
+        """Rewrite *summary* with the pipeline and return the joined result.
+
+        The text is split into chunks of at most MAX_CHUNK_WIDTH characters;
+        the ``summary_text`` of each pipeline result is joined with spaces.
+        Empty or whitespace-only input yields an empty string.
+        """
+        chunks = wrap(summary, Summarizer.MAX_CHUNK_WIDTH)
+        if not chunks:
+            # wrap() returns [] for empty/whitespace-only text; do not call the
+            # pipeline with an empty batch.
+            return ""
+        return " ".join(result['summary_text'] for result in self.pipeline(chunks))

app.py CHANGED Viewed

@@ -1,21 +1,14 @@
-from textwrap import wrap
-from transformers import pipeline
 import streamlit as st
-from sumy.parsers.plaintext import PlaintextParser
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.nlp.stemmers import Stemmer
-from sumy.summarizers.lsa import LsaSummarizer
-from sumy.utils import get_stop_words
-import nltk
 nltk.download('punkt')
-DEFAULT_LANGUAGE = "english"
 DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
-stemmer = Stemmer(DEFAULT_LANGUAGE)
-lsa_summarizer = LsaSummarizer(stemmer)
-lsa_summarizer.stop_words = get_stop_words(language=DEFAULT_LANGUAGE)
 st.markdown('# Terms & conditions abstractive summarization model :pencil:')
 st.write('This app provides the abstract summary of the provided terms & conditions. '
@@ -25,14 +18,14 @@ st.write('Information about the model :point_right: https://huggingface.co/ml6te
 st.markdown("""
 To use this:
 - Number of sentences to be extracted is configurable
-- Copy terms & conditions and hit 'Summarize'
 """)
 @st.cache(allow_output_mutation=True,
           suppress_st_warning=True,
           show_spinner=False)
-def load_model():
     with st.spinner('Please wait for the model to load...'):
         terms_and_conditions_pipeline = pipeline(
             task='summarization',
@@ -42,15 +35,40 @@ def load_model():
     return terms_and_conditions_pipeline
-tc_pipeline = load_model()
 if 'tc_text' not in st.session_state:
-    st.session_state['tc_text'] = ""
 if 'sentences_length' not in st.session_state:
     st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
 st.header("Input")
 with st.form(key='terms-and-conditions'):
     sentences_length_input = st.number_input(
         label='Number of sentences to be extracted:',
@@ -59,53 +77,23 @@ with st.form(key='terms-and-conditions'):
     )
     tc_text_input = st.text_area(
         value=st.session_state.tc_text,
-        label='Terms & conditions text:',
-        height=240
-    )
-    submit_button = st.form_submit_button(label='Summarize')
-st.header("Output")
-def generate_abstractive_summary(summary) -> str:
-    summary_text = " ".join([result['summary_text'] for result in tc_pipeline(wrap(summary, 2048))])
-    return summary_text
-def generate_extractive_summary(text, sentences_count: int) -> str:
-    parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
-    summarized_sentences = lsa_summarizer(parser.document, sentences_count)
-    summarized_text = " ".join([sentence._text for sentence in summarized_sentences])
-    return summarized_text
-def display_abstractive_summary(summary) -> None:
-    st.subheader("Abstractive Summary")
-    st.markdown('#####')
-    st.text_area(
-        value=summary,
-        label='',
-        height=240
-    )
-def display_extractive_summary(summary) -> None:
-    st.subheader("Extractive Summary")
-    st.markdown('#####')
-    st.text_area(
-        value=summary,
-        label='',
         height=240
     )
 if submit_button:
-    tc_text = tc_text_input
-    sentences_length = sentences_length_input
-    extract_summary = generate_extractive_summary(tc_text, sentences_length)
-    abstract_summary = generate_abstractive_summary(extract_summary)
-    display_extractive_summary(extract_summary)
-    display_abstractive_summary(abstract_summary)

+import nltk
 import streamlit as st
+import validators
+from transformers import pipeline
+from validators import ValidationFailure
+from Summarizer import Summarizer
 nltk.download('punkt')
 DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
 st.markdown('# Terms & conditions abstractive summarization model :pencil:')
 st.write('This app provides the abstract summary of the provided terms & conditions. '
 st.markdown("""
 To use this:
 - Number of sentences to be extracted is configurable
+- Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
 """)
 @st.cache(allow_output_mutation=True,
           suppress_st_warning=True,
           show_spinner=False)
+def create_pipeline():
     with st.spinner('Please wait for the model to load...'):
         terms_and_conditions_pipeline = pipeline(
             task='summarization',
     return terms_and_conditions_pipeline
+def display_abstractive_summary(summary) -> None:
+    """Show *summary* under an "Abstractive Summary" subheader."""
+    st.subheader("Abstractive Summary")
+    # Heading with no text, placed between the subheader and the summary.
+    st.markdown('#####')
+    st.markdown(summary)
+def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
+    """Show the full text with the extracted summary sentences highlighted.
+
+    The sentences of *terms_and_conditions_sentences* are joined with spaces
+    and every occurrence of a sentence from *summary_sentences* is wrapped in
+    a yellow-background ``<span>``.
+    """
+    # Local import keeps this function self-contained.
+    from html import escape
+
+    st.subheader("Extractive Summary")
+    st.markdown('#####')
+    # The text is rendered with unsafe_allow_html=True and may originate from
+    # an arbitrary URL, so escape it before adding our own markup.
+    highlighted_text = escape(" ".join(terms_and_conditions_sentences), quote=False)
+    for sentence in summary_sentences:
+        if not sentence:
+            # str.replace with an empty needle would insert the span between
+            # every character of the text.
+            continue
+        escaped_sentence = escape(sentence, quote=False)
+        highlighted_text = highlighted_text.replace(
+            escaped_sentence,
+            f"<span style='background-color: #FFFF00'>{escaped_sentence}</span>",
+        )
+    st.write(highlighted_text, unsafe_allow_html=True)
+def is_valid_url(url: str) -> bool:
+    """Tell whether *url* passes ``validators.url``.
+
+    Returns False when the validator yields a ``ValidationFailure`` and True
+    otherwise.
+    """
+    return not isinstance(validators.url(url), ValidationFailure)
+summarizer: Summarizer = Summarizer(create_pipeline())
 if 'tc_text' not in st.session_state:
+    st.session_state['tc_text'] = ''
 if 'sentences_length' not in st.session_state:
     st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
+st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
 st.header("Input")
 with st.form(key='terms-and-conditions'):
     sentences_length_input = st.number_input(
         label='Number of sentences to be extracted:',
     )
     tc_text_input = st.text_area(
         value=st.session_state.tc_text,
+        label='Terms & conditions content or specify an URL:',
         height=240
     )
+    submit_button = st.form_submit_button(label='Summarize')
 if submit_button:
+    # Input that validates as a URL is loaded through the HTML parser;
+    # anything else is summarized as plain text.
+    extract = (summarizer.extractive_summary_from_url if is_valid_url(tc_text_input)
+               else summarizer.extractive_summary_from_text)
+    all_sentences, extract_summary_sentences = extract(tc_text_input, sentences_length_input)
+    # The abstractive model is fed the extracted sentences, not the full text.
+    extract_summary = " ".join(extract_summary_sentences)
+    abstract_summary = summarizer.abstractive_summary(extract_summary)
+    display_extractive_summary(all_sentences, extract_summary_sentences)
+    display_abstractive_summary(abstract_summary)

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ torch==1.9.1
 torchvision==0.10.1
 transformers
 sumy==0.9.0
-nltk

 torchvision==0.10.1
 transformers
 sumy==0.9.0
+nltk
+validators