tommasobaldi committed on
Commit
fbe3ac9
1 Parent(s): c4a98a3

add first example of app.py

Browse files
Files changed (2) hide show
  1. Summarizer.py +56 -0
  2. app.py +111 -0
Summarizer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+
3
+ from sumy.nlp.stemmers import Stemmer
4
+ from sumy.summarizers.lsa import LsaSummarizer
5
+ from sumy.utils import get_stop_words
6
+ from transformers import Pipeline
7
+
8
class Summarizer:
    """Summarizes Terms-of-Service documents.

    Combines a sumy LSA extractive summarizer (built in ``__init__``) with a
    transformers abstractive summarization pipeline supplied by the caller.
    """

    DEFAULT_LANGUAGE = "english"
    DEFAULT_SENTENCE_LENGTH = 15

    def __init__(self, pipeline: Pipeline):
        """Store the abstractive *pipeline* and build an English LSA summarizer."""
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        """Return the raw text of each sumy Sentence in *summarized_sentences*."""
        # NOTE(review): reaches into sumy's private `_text` attribute;
        # str(sentence) would use the public interface — confirm equivalence.
        return [sentence._text for sentence in summarized_sentences]

    @staticmethod
    def join_sentences(summarized_sentences: list) -> str:
        """Join the sentences into one space-separated string."""
        # fix: dropped the redundant identity comprehension around join()
        return " ".join(summarized_sentences)

    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
        """Greedily pack *summary_sentences* into space-joined chunks.

        Each chunk's cumulative NLTK token count (ignoring "." tokens) stays
        within *split_token_length*; a single over-long sentence still forms
        its own chunk.
        """
        accumulated_list = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            token_list = [token for token in nltk.word_tokenize(sentence) if token not in ["."]]
            token_length = len(token_list)
            # Close the current chunk only when adding this sentence would
            # overflow it and it is non-empty.
            if token_length + cumulative_token_length > split_token_length and result_list:
                accumulated_list.append(Summarizer.join_sentences(result_list))
                result_list = [sentence]
                cumulative_token_length = token_length
            else:
                result_list.append(sentence)
                cumulative_token_length += token_length

        if result_list:
            accumulated_list.append(Summarizer.join_sentences(result_list))

        return accumulated_list

    def abstractive_summary(self, summary_sentences: list) -> list:
        """Run the pipeline over ~600-token chunks of *summary_sentences*.

        Returns one generated summary string per chunk
        (the pipeline's ``summary_text`` field).
        """
        wrapped_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=600)
        summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=32, max_length=512):
            summary_list.append(result['summary_text'])

        return summary_list
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+ import os
3
+ from typing import AnyStr
4
+
5
+ import nltk
6
+ import streamlit as st
7
+ import validators
8
+ from transformers import pipeline
9
+ from validators import ValidationFailure
10
+
11
+ from Summarizer import Summarizer
12
+
13
+
14
def main() -> None:
    """Streamlit entry point: render the Terms-of-Service summarizer UI,
    load the cached summarization pipeline and, when the user presses the
    button, display an abstractive summary of the selected/pasted document."""
    nltk.download("punkt")  # sentence-tokenizer models used by sent_tokenize below

    # --- header ---
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        """Load the BART ToS summarization pipeline once (cached by Streamlit)."""
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos"
                                    )
        return tos_pipeline

    def display_summary(summary_sentences: list) -> None:
        """Render each summary sentence as an HTML list item."""
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:  # fix: variable was misspelled "senetence"
            st.markdown(f"<li>{sentence}</li>", unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True when *url* is a syntactically valid URL."""
        result = validators.url(url)
        return not isinstance(result, ValidationFailure)

    def get_list_files() -> list:
        """List sample document names (./samples/*.txt, extension stripped)."""
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))

        return names

    def fetch_file_content(filename: str) -> AnyStr:
        """Read and return the text of ./samples/<filename>.txt."""
        with open(f"./samples/{filename.lower()}.txt", "r") as file:
            text = file.read()
        return text

    summarizer: Summarizer = Summarizer(create_pipeline())

    # Session-state defaults. Fixes two bugs in the original initialization:
    #  * the guard checked the misspelled key "sentence_lenght", so the real
    #    "sentence_length" default was re-assigned on every rerun;
    #  * the "sample_choice" guard overwrote sentence_length with "" instead
    #    of initializing sample_choice.
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sentence_length" not in st.session_state:
        st.session_state.sentence_length = Summarizer.DEFAULT_SENTENCE_LENGTH
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")

    sentences_length = st.number_input(
        label="How many sentences to be extracted:",  # fix: label typo "senetences"
        min_value=5,
        max_value=15,
        value=st.session_state.sentence_length
    )
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

    @st.cache(suppress_st_warning=True,
              show_spinner=False,
              allow_output_mutation=True,
              hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
                          "tokenizers.Tokenizer": lambda _: None,
                          "tokenizers.AddedToken": lambda _: None,
                          })
    def summary_from_cache(summary_sentence: tuple) -> tuple:
        """Cached abstractive summary; tuple in/out so Streamlit can hash it."""
        with st.spinner("Summarizing in progress..."):
            return tuple(summarizer.abstractive_summary(list(summary_sentence)))

    if summarize_button:
        # Fix: the original called the transformers `pipeline` *factory* on the
        # raw text and then tried to call its return value (`output(output[0])`),
        # which fails at runtime. Instead, split the document into sentences,
        # summarize through the cached helper, and render the result.
        # NOTE(review): `sentences_length` and `is_valid_url` are still unused —
        # presumably intended for extractive summarization / URL input later.
        document_sentences = nltk.sent_tokenize(target_text_input)
        summary = summary_from_cache(tuple(document_sentences))
        display_summary(list(summary))
108
+
109
+
110
# Standard script guard: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()