import os
import nltk
import streamlit as st
from transformers import pipeline, AutoTokenizer
import re


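# Streamlit app: summarizes Terms of Service documents with the ML-unipi/bart-large-tos model.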
def main() -> None:
    # header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

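    # Cache the Hugging Face summarization pipeline so the model is loaded only once per session.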
    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos",
                                    )
        return tos_pipeline

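    # Split a generated summary into sentences with a regex sentence splitter and keep only
    # the ones containing a full stop, collapsing accidental double periods.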
    def clean_summaries(text: str) -> list:
        result = []
        lines = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        for line in lines:
            if line.find(".") != -1:
                line = line.replace("..", ".")
                result.append(line)
        return result


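    # Render each summary sentence as an HTML list item.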
    def display_summary(summary_sentences: list) -> None:
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            st.markdown(f"<li>{sentence}</li>", unsafe_allow_html=True)

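    # List the available sample documents (.txt files in ./samples/) without their extension.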
    def get_list_files() -> list:
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))

        return names

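    # Read the selected sample file from ./samples/ and return its text content.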
    def fetch_file_content(filename: str) -> str:
        with open(f"./samples/{filename.lower()}.txt", "r", encoding="utf-8") as file:
            text = file.read()
        return text

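    # Re-join a chunk of sentences into a single string for the summarizer.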
    def join_sentences(sentences: list) -> str:
        return " ".join(sentences)

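    # Greedily group sentences into chunks whose total token count stays below split_token_length,
    # so each chunk fits in the model's input window; sentences of 10 tokens or fewer are skipped.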
    def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0

        for sentence in sentences:
            token_list = tokenizer(sentence, max_length=1024, truncation=True)
            token_length = len(token_list["input_ids"])
            if token_length > 10:
                if token_length + cumulative_token_length > split_token_length and result_list:
                    accumulated_lists.append(join_sentences(result_list))
                    result_list = [sentence]
                    cumulative_token_length = token_length
                else:
                    result_list.append(sentence)
                    cumulative_token_length += token_length
        if result_list:
            accumulated_lists.append(join_sentences(result_list))
        return accumulated_lists

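    # One-time setup: NLTK sentence tokenizer data, summarization pipeline, and model tokenizer.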
    nltk.download("punkt")
    pipe = create_pipeline()
    tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")

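    # Initialize session state for the input text and the sample selection.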
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

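    # On click: sentence-tokenize the input, chunk it to fit the model,
    # summarize each chunk, clean the generated sentences, and display the result.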
    if summarize_button:
        if target_text_input != "":
            summary_sentences = []
            with st.spinner("Summarizing in progress..."):
                sentences = split_sentences_by_token_length(nltk.sent_tokenize(target_text_input, language="english"),
                                                            split_token_length=1024
                                                            )
                for sentence in sentences:
                    output = pipe(sentence)
                    summary = output[0]["summary_text"]
                    summary_sentences += clean_summaries(summary)
                display_summary(summary_sentences)


if __name__ == "__main__":
    main()