import html
import os

import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure


def main() -> None:
    # Make sure the NLTK sentence tokenizer data is available.
    nltk.download("punkt")

    # Header
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms of Service documents, which are often too "
                "long and difficult to understand.")
    st.markdown("To test it, copy-paste a Terms of Service into the text area or select one of the examples that we "
                "have prepared for you; you will then see the summary presented as the most important sentences.")
    st.markdown("If you want more info on how we built our NLP algorithm, check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("The app is still under development and we do not give any guarantee on the quality of the "
                "summaries, so we suggest a careful reading of the original document.")

    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        # Load (and cache) the fine-tuned BART summarization model.
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(
                task="summarization",
                model="ML-unipi/bart-large-tos",
                tokenizer="ML-unipi/bart-large-tos",
            )
        return tos_pipeline

    def display_summary(summary_sentences: list) -> None:
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            # Render each summary sentence as an HTML bullet point.
            st.markdown(f"<ul><li>{html.escape(sentence)}</li></ul>", unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        result = validators.url(url)
        if isinstance(result, ValidationFailure):
            return False
        return True

    def get_list_files() -> list:
        # Collect the names of the sample .txt files shipped with the app.
        names = []
        for file in os.listdir("./samples/"):
            if file.endswith(".txt"):
                names.append(file.replace(".txt", ""))
        return names

    def fetch_file_content(filename: str) -> str:
        with open(f"./samples/{filename.lower()}.txt", "r") as file:
            text = file.read()
        return text

    # Initialise the session state on the first run.
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sentence_length" not in st.session_state:
        st.session_state.sentence_length = 15
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")

    # sentences_length = st.number_input(
    #     label="How many sentences to be extracted:",
    #     min_value=5,
    #     max_value=15,
    #     step=1,
    #     value=st.session_state.sentence_length
    # )

    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )

    st.session_state.target_text = fetch_file_content(sample_choice)

    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Terms of Service:",
        height=240
    )

    summarize_button = st.button(label="Try it!")

    # @st.cache(suppress_st_warning=True,
    #           show_spinner=False,
    #           allow_output_mutation=True,
    #           hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
    #                       "tokenizers.Tokenizer": lambda _: None,
    #                       "tokenizers.AddedToken": lambda _: None,
    #                       }
    #           )
    # def summary_from_cache(summary_sentence: tuple) -> tuple:
    #     with st.spinner("Summarizing in progress..."):
    #         return tuple(summarizer.abstractive_summary(list(summary_sentence)))

    if summarize_button:
        summarizer = create_pipeline()
        with st.spinner("Summarizing in progress..."):
            # The summarization pipeline returns a list of dicts with a
            # "summary_text" key, one per input document.
            output = summarizer(target_text_input)
        display_summary([result["summary_text"] for result in output])


if __name__ == "__main__":
    main()