"""Streamlit app that splits uploaded text into clauses.

Each sentence of the uploaded ``.txt`` file is parsed with the stanza
constituency parser; the text of every "S" constituent is then shown as a
colour-highlighted segment next to the original line.
"""

from io import StringIO
from itertools import cycle

import stanza
import streamlit as st
from annotated_text import annotated_text
from nltk import Tree

st.set_page_config(layout='wide')
st.title("Clause segmentation")
st.info("This bert stanza parser is very slow, please checkout the berkley parser")
uploaded_file = st.file_uploader("Upload your text file", type="txt")
# NOTE(review): the pipeline is built at module level, so it is constructed
# every time this script is executed -- consider caching it if the installed
# Streamlit version offers a resource cache.
nlp = stanza.Pipeline(
    lang="en",
    processors='tokenize,pos,constituency',
    package={'constituency': 'wsj_bert'},
)


def tree_to_text(tree):
    """Yield the text of every "S" constituent found in *tree*.

    *tree* is converted with ``str()`` and re-parsed as an ``nltk.Tree``.
    The leaves of each subtree labelled "S" are joined with single spaces,
    in the order ``Tree.subtrees()`` produces them.  Each clause is then cut
    off at the point where the text of the following clause starts, so an
    enclosing clause does not repeat the text of the clause nested in it.

    When the following clause is not contained in the current one (for
    example two sibling clauses), the current clause is left untouched.
    """
    parsed = Tree.fromstring(str(tree))
    clauses = [
        ' '.join(subtree.leaves())
        for subtree in parsed.subtrees()
        if subtree.label() == "S"
    ]
    for index in range(len(clauses) - 1):
        # ``find`` instead of ``index``: a sibling clause is not a substring
        # of its predecessor, and ``index`` would raise ValueError for it.
        cut = clauses[index].find(clauses[index + 1])
        if cut != -1:
            clauses[index] = clauses[index][:cut]
    yield from clauses


def constituency_tree():
    """Yield the constituency tree of each sentence in the uploaded file.

    The whole upload is decoded as UTF-8 and passed to the stanza pipeline
    in a single call; sentence boundaries are those found by the pipeline.
    """
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    doc = nlp(stringio.read())
    for sent in doc.sentences:
        yield sent.constituency


def sentence_reader():
    """Yield the uploaded file line by line (line endings are kept)."""
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    yield from stringio.readlines()


def main():
    """Render one expander per sentence with its clauses highlighted.

    Trees and raw lines are paired positionally with ``zip``, so iteration
    stops at the shorter of the two.
    NOTE(review): this pairing presumably expects one sentence per line in
    the uploaded file -- confirm with the intended input format.
    """
    # Alternative palette:
    # colors = ["#9CFAFF", "#B1FBDF", "#C5FDBE", "#DAFE9E", "#EEFF7D"]
    colors = ["#eae4e9", "#fff1e6", "#fde2e4", "#fad2e1", "#e2ece9",
              "#bee1e6", "#f0efeb", "#dfe7fd", "#cddafd"]
    key = 1
    for tree, text in zip(constituency_tree(), sentence_reader()):
        with st.expander(f"Sentence{key}", expanded=True):
            # Cycle through the palette so that sentences with more clauses
            # than colours still show every clause instead of dropping the
            # surplus ones.
            annotated_list = [
                (sub_phrase, "", color)
                for sub_phrase, color in zip(tree_to_text(tree), cycle(colors))
            ]
            key += 1
            st.write(f'{text}')
            annotated_text(*annotated_list)


if __name__ == "__main__":
    if uploaded_file is not None:
        main()