tommasobaldi committed on
Commit
fbe3ac9
1 Parent(s): c4a98a3

add first example of app.py

Browse files
Files changed (2) hide show
  1. Summarizer.py +56 -0
  2. app.py +111 -0
Summarizer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+
3
+ from sumy.nlp.stemmers import Stemmer
4
+ from sumy.summarizers.lsa import LsaSummarizer
5
+ from sumy.utils import get_stop_words
6
+ from transformers import Pipeline
7
+
8
class Summarizer:
    """Summarizes Terms-of-Service documents.

    Combines a sumy LSA extractive summarizer (built in ``__init__``) with a
    transformers abstractive summarization pipeline supplied by the caller.
    """

    DEFAULT_LANGUAGE = "english"
    DEFAULT_SENTENCE_LENGTH = 15

    def __init__(self, pipeline: Pipeline):
        """Store the abstractive *pipeline* and build an English LSA summarizer."""
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        """Return the raw text of each sumy Sentence in *summarized_sentences*."""
        # NOTE(review): reaches into sumy's private `_text` attribute;
        # str(sentence) would use the public interface — confirm equivalence.
        return [sentence._text for sentence in summarized_sentences]

    @staticmethod
    def join_sentences(summarized_sentences: list) -> str:
        """Join the sentences into one space-separated string."""
        # fix: dropped the redundant identity comprehension around join()
        return " ".join(summarized_sentences)

    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
        """Greedily pack *summary_sentences* into space-joined chunks.

        Each chunk's cumulative NLTK token count (ignoring "." tokens) stays
        within *split_token_length*; a single over-long sentence still forms
        its own chunk.
        """
        accumulated_list = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            token_list = [token for token in nltk.word_tokenize(sentence) if token not in ["."]]
            token_length = len(token_list)
            # Close the current chunk only when adding this sentence would
            # overflow it and it is non-empty.
            if token_length + cumulative_token_length > split_token_length and result_list:
                accumulated_list.append(Summarizer.join_sentences(result_list))
                result_list = [sentence]
                cumulative_token_length = token_length
            else:
                result_list.append(sentence)
                cumulative_token_length += token_length

        if result_list:
            accumulated_list.append(Summarizer.join_sentences(result_list))

        return accumulated_list

    def abstractive_summary(self, summary_sentences: list) -> list:
        """Run the pipeline over ~600-token chunks of *summary_sentences*.

        Returns one generated summary string per chunk
        (the pipeline's ``summary_text`` field).
        """
        wrapped_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=600)
        summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=32, max_length=512):
            summary_list.append(result['summary_text'])

        return summary_list
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+ import os
3
+ from typing import AnyStr
4
+
5
+ import nltk
6
+ import streamlit as st
7
+ import validators
8
+ from transformers import pipeline
9
+ from validators import ValidationFailure
10
+
11
+ from Summarizer import Summarizer
12
+
13
+
14
def main() -> None:
    """Streamlit entry point: render the Terms-of-Service summarizer UI,
    load the cached summarization pipeline and, when the user presses the
    button, display an abstractive summary of the selected/pasted document."""
    nltk.download("punkt")  # sentence-tokenizer models used by sent_tokenize below

    # --- header ---
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        """Load the BART ToS summarization pipeline once (cached by Streamlit)."""
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos"
                                    )
        return tos_pipeline

    def display_summary(summary_sentences: list) -> None:
        """Render each summary sentence as an HTML list item."""
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:  # fix: variable was misspelled "senetence"
            st.markdown(f"<li>{sentence}</li>", unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True when *url* is a syntactically valid URL."""
        result = validators.url(url)
        return not isinstance(result, ValidationFailure)

    def get_list_files() -> list:
        """List sample document names (./samples/*.txt, extension stripped)."""
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))

        return names

    def fetch_file_content(filename: str) -> AnyStr:
        """Read and return the text of ./samples/<filename>.txt."""
        with open(f"./samples/{filename.lower()}.txt", "r") as file:
            text = file.read()
        return text

    summarizer: Summarizer = Summarizer(create_pipeline())

    # Session-state defaults. Fixes two bugs in the original initialization:
    #  * the guard checked the misspelled key "sentence_lenght", so the real
    #    "sentence_length" default was re-assigned on every rerun;
    #  * the "sample_choice" guard overwrote sentence_length with "" instead
    #    of initializing sample_choice.
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sentence_length" not in st.session_state:
        st.session_state.sentence_length = Summarizer.DEFAULT_SENTENCE_LENGTH
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")

    sentences_length = st.number_input(
        label="How many sentences to be extracted:",  # fix: label typo "senetences"
        min_value=5,
        max_value=15,
        value=st.session_state.sentence_length
    )
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

    @st.cache(suppress_st_warning=True,
              show_spinner=False,
              allow_output_mutation=True,
              hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
                          "tokenizers.Tokenizer": lambda _: None,
                          "tokenizers.AddedToken": lambda _: None,
                          })
    def summary_from_cache(summary_sentence: tuple) -> tuple:
        """Cached abstractive summary; tuple in/out so Streamlit can hash it."""
        with st.spinner("Summarizing in progress..."):
            return tuple(summarizer.abstractive_summary(list(summary_sentence)))

    if summarize_button:
        # Fix: the original called the transformers `pipeline` *factory* on the
        # raw text and then tried to call its return value (`output(output[0])`),
        # which fails at runtime. Instead, split the document into sentences,
        # summarize through the cached helper, and render the result.
        # NOTE(review): `sentences_length` and `is_valid_url` are still unused —
        # presumably intended for extractive summarization / URL input later.
        document_sentences = nltk.sent_tokenize(target_text_input)
        summary = summary_from_cache(tuple(document_sentences))
        display_summary(list(summary))
108
+
109
+
110
# Standard script guard: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()