sdhanabal1 committed
Commit 4372d93
1 Parent(s): c98407b

Update min length to 24 tokens
Files changed (3):
  1. Summarizer.py +6 -9
  2. app.py +1 -1
  3. test_summarizer.py +3 -3
Summarizer.py CHANGED

```diff
@@ -1,4 +1,3 @@
-import string
 import nltk
 
 from sumy.parsers import DocumentParser
@@ -13,8 +12,7 @@ from transformers import Pipeline
 
 class Summarizer:
     DEFAULT_LANGUAGE = "english"
-    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
-    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
+    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 15
 
     def __init__(self, pipeline: Pipeline):
         self.pipeline = pipeline
@@ -34,16 +32,15 @@ class Summarizer:
         return " ".join([sentence for sentence in summary_sentences])
 
     @staticmethod
-    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
+    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
         accumulated_lists = []
         result_list = []
         cumulative_token_length = 0
         for sentence in summary_sentences:
             result_list.append(sentence)
             token_list = nltk.word_tokenize(sentence)
-            token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
-            token_length = len(token_words)
-            if token_length + cumulative_token_length >= max_token_length:
+            token_length = len(token_list)
+            if token_length + cumulative_token_length >= split_token_length:
                 accumulated_lists.append(Summarizer.join_sentences(result_list))
                 result_list = []
                 cumulative_token_length = 0
@@ -72,9 +69,9 @@ class Summarizer:
         :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
         """
         wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
-                                                                       max_token_length=1000)
+                                                                       split_token_length=512)
         # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
         abstractive_summary_list = []
-        for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
+        for result in self.pipeline(wrapped_sentences, min_length=24, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
        return abstractive_summary_list
```
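For reference, a minimal self-contained sketch of the splitting behaviour after this commit. Everything mirrors the diff except the `else` branch and the trailing flush, which fall outside the hunk shown above and are therefore assumptions; the updated tests in test_summarizer.py only pass if both exist.

```python
import nltk

nltk.download("punkt", quiet=True)  # tokenizer models used by nltk.word_tokenize


def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
    """Group sentences into chunks of roughly split_token_length tokens."""
    accumulated_lists = []
    result_list = []
    cumulative_token_length = 0
    for sentence in summary_sentences:
        result_list.append(sentence)
        # After this commit every token counts: stop words and punctuation
        # are no longer filtered out before measuring length.
        token_length = len(nltk.word_tokenize(sentence))
        if token_length + cumulative_token_length >= split_token_length:
            # " ".join stands in for Summarizer.join_sentences, which per the
            # diff context does the same thing.
            accumulated_lists.append(" ".join(result_list))
            result_list = []
            cumulative_token_length = 0
        else:  # assumed: tokens accumulate until the threshold is crossed
            cumulative_token_length += token_length
    if result_list:  # assumed: trailing sentences are flushed as a final chunk
        accumulated_lists.append(" ".join(result_list))
    return accumulated_lists


sentences = ['Python is a programming language.', 'Memory allocation.', 'Free.']
print(split_sentences_by_token_length(sentences, split_token_length=7))
# ['Python is a programming language. Memory allocation.', 'Free.']
```

Since raw token counts run higher than stop-word-filtered ones, the chunk budget drops from 1000 to 512, keeping each chunk comfortably under the 1024-token input cap of the ml6team/distilbart-tos-summarizer-tosdr tokenizer. On the generation side, min_length=24 forces the pipeline to emit at least 24 tokens per chunk, presumably to avoid degenerate one-phrase summaries.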
app.py CHANGED

```diff
@@ -19,7 +19,7 @@ def main() -> None:
                 "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
     st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
                 'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
-                'Now you can just take a quick glanse at the summary and go about the rest of your day assured that no one is abusing your precious personal data :books:', unsafe_allow_html=True)
+                'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:', unsafe_allow_html=True)
     st.markdown('<b>Want to find out more?</b> :brain:<br>'
                 'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
                 'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)
```
test_summarizer.py CHANGED

```diff
@@ -8,19 +8,19 @@ def test_split_sentences_by_token_length():
         'Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=3)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=5)
     assert split_sentences == [
         'Python is a programming language.',
         'Memory allocation. Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=7)
     assert split_sentences == [
         'Python is a programming language. Memory allocation.',
         'Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=10)
     assert split_sentences == [
         'Python is a programming language. Memory allocation. Free.'
     ]
```
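The new thresholds (5, 7, 10 in place of 3, 5, 10) follow directly from dropping the stop-word filter: sentence lengths are now measured on the raw nltk.word_tokenize output. A quick check of the counts, shown below, confirms the same groupings still hold (the per-sentence figures are computed here, not taken from the repo).

```python
import nltk

nltk.download("punkt", quiet=True)

for s in ['Python is a programming language.', 'Memory allocation.', 'Free.']:
    print(f"{s!r} -> {len(nltk.word_tokenize(s))} tokens")
# 'Python is a programming language.' -> 6 tokens  (was 3 with stop words/punctuation removed)
# 'Memory allocation.' -> 3 tokens                 (was 2)
# 'Free.' -> 2 tokens                              (was 1)
```

With split_token_length=7, for example, the first sentence (6 tokens) stays under the threshold, the second pushes the running total to 9 and flushes both into one chunk, and the trailing 'Free.' becomes its own chunk, exactly as the second assertion expects.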