# Streamlit Space: Terms-of-Service summarizer (model: ML-unipi/bart-large-tos).
# NOTE(review): the original three lines here ("Spaces:" / "Runtime error" x2)
# were Hugging Face Spaces page residue from a scrape, not source code.
import os | |
from typing import AnyStr | |
import nltk | |
import streamlit as st | |
from transformers import pipeline, AutoTokenizer | |
import re | |
def main() -> None:
    """Render the Terms-of-Service summarizer UI and run summarization on demand.

    The whole app lives in this function because Streamlit re-executes the
    script top-to-bottom on every user interaction; the nested helpers share
    the ``tokenizer`` local defined further down via closure.
    """

    # ----- static header / project description -----
    st.title(":bookmark_tabs: Terms Of Service Summarizer :bookmark_tabs:")
    st.markdown("The app aims to extract the main information from Terms Of Conditions, which are often too long and "
                "difficult to understand. ")
    st.markdown("To test it just copy-paste a Terms Of Conditions in the textarea or select one of the examples that "
                "we have prepared for you, then you will see the summary represented as the most important sentences.")
    st.markdown("If you want more info in how we built our NLP algorithm check the documentation in the following "
                "GitHub repo: :point_right: https://github.com/balditommaso/TermsOfServiceSummarization :point_left:")
    st.markdown(":skull_and_crossbones: NOTE :skull_and_crossbones::")
    st.markdown("the App is still under development and we do not give any guarantee on the quality of the summaries, "
                "so we suggest a careful reading of the document.")

    def create_pipeline():
        """Load the summarization pipeline from the Hugging Face Hub.

        NOTE(review): this runs on every Streamlit rerun; wrapping it with
        ``st.cache_resource`` would avoid reloading the model on each
        interaction — confirm the deployed Streamlit version supports it.
        """
        with st.spinner("Loading the model..."):
            tos_pipeline = pipeline(task="summarization",
                                    model="ML-unipi/bart-large-tos",
                                    tokenizer="ML-unipi/bart-large-tos",
                                    )
        return tos_pipeline

    def clean_summaries(text: str) -> list:
        """Split a generated summary into sentences and tidy them.

        Keeps only fragments containing at least one period (this also drops
        fragments ending solely in '?' — presumably a deliberate filter of
        truncated model output; verify) and collapses the double periods the
        model sometimes emits.
        """
        result = []
        # Split after '.' or '?' followed by whitespace; the negative
        # look-behinds avoid splitting inside abbreviations like "e.g." or
        # honorifics like "Mr.".
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        for sentence in sentences:
            if "." in sentence:
                result.append(sentence.replace("..", "."))
        return result

    def display_summary(summary_sentences: list) -> None:
        """Render the summary sentences as an HTML bullet list."""
        st.subheader("Summary :male-detective:")
        for sentence in summary_sentences:
            # unsafe_allow_html is required for the <li> markup; the text
            # comes from the model output, not from raw user HTML.
            st.markdown(f"<li>{sentence}</li>", unsafe_allow_html=True)

    def get_list_files() -> list:
        """Return the sample names: ``.txt`` files under ./samples/, extension stripped."""
        # sorted() keeps the selectbox order deterministic across platforms
        # (os.listdir order is arbitrary).
        return sorted(
            file.replace(".txt", "")
            for file in os.listdir("./samples/")
            if file.endswith(".txt")
        )

    def fetch_file_content(filename: str) -> AnyStr:
        """Read and return the full text of the named sample file.

        NOTE(review): the name is lower-cased before the lookup, so sample
        files must be stored with lower-case names or this raises
        FileNotFoundError — confirm against the contents of ./samples/.
        """
        with open(f"./samples/{filename.lower()}.txt", "r", encoding="utf-8") as file:
            return file.read()

    def join_sentences(sentences: list) -> str:
        """Concatenate sentences into one space-separated string."""
        return " ".join(sentences)

    def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
        """Greedily pack sentences into chunks of at most ``split_token_length`` tokens.

        Token counts come from the model tokenizer so each chunk fits the
        model's input window. Sentences of 10 tokens or fewer are silently
        discarded (presumably noise/fragment filtering — confirm intended).
        """
        accumulated_lists = []
        current_chunk = []
        cumulative_token_length = 0
        for sentence in sentences:
            tokens = tokenizer(sentence, max_length=1024, truncation=True)
            token_length = len(tokens["input_ids"])
            if token_length <= 10:
                continue  # skip very short fragments
            if token_length + cumulative_token_length > split_token_length and current_chunk:
                # Current chunk is full: flush it and start a new one.
                accumulated_lists.append(join_sentences(current_chunk))
                current_chunk = [sentence]
                cumulative_token_length = token_length
            else:
                current_chunk.append(sentence)
                cumulative_token_length += token_length
        if current_chunk:
            accumulated_lists.append(join_sentences(current_chunk))
        return accumulated_lists

    # ----- model / resources -----
    nltk.download("punkt", quiet=True)  # sentence-tokenizer data; no-op when already present
    pipe = create_pipeline()
    tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")

    # ----- input widgets -----
    if "target_text" not in st.session_state:
        st.session_state.target_text = ""
    if "sample_choice" not in st.session_state:
        st.session_state.sample_choice = ""

    st.header("Input")
    sample_choice = st.selectbox(
        label="Select a sample:",
        options=get_list_files()
    )
    st.session_state.target_text = fetch_file_content(sample_choice)
    target_text_input = st.text_area(
        value=st.session_state.target_text,
        label="Paste your own Term Of Service:",
        height=240
    )
    summarize_button = st.button(label="Try it!")

    # ----- summarization -----
    if summarize_button and target_text_input:
        summary_sentences = []
        with st.spinner("Summarizing in progress..."):
            # Chunk the document so each piece fits the model's 1024-token window.
            chunks = split_sentences_by_token_length(
                nltk.sent_tokenize(target_text_input, language="english"),
                split_token_length=1024,
            )
            for chunk in chunks:
                output = pipe(chunk)
                summary_sentences += clean_summaries(output[0]["summary_text"])
            display_summary(summary_sentences)
# Script guard: Streamlit executes this module as a script
# (`streamlit run <file>`), which triggers main() here.
if __name__ == "__main__":
    main()