from io import StringIO
import streamlit as st
from nltk import Tree
import stanza
from annotated_text import annotated_text

# Page chrome: wide layout, a title, and a notice that this parser is slow.
# NOTE(review): Streamlit's documentation asks for set_page_config to run
# before other st.* commands, so keep this call first -- confirm against the
# installed Streamlit version.
st.set_page_config(layout='wide')
st.title("Clause segmentation")
st.info("This bert stanza parser is very slow, please checkout the berkley parser")

# File picker restricted to "txt" uploads. The result is read as a module-level
# global by the helper functions below and may be None (the entry point checks).
uploaded_file = st.file_uploader("Upload your text file", type="txt")
# English Stanza pipeline with tokenize, pos and constituency processors, using
# the 'wsj_bert' package for the constituency parser.
# NOTE(review): this is constructed at module level, i.e. every time the script
# body is executed, even when no file has been uploaded -- consider caching it.
nlp = stanza.Pipeline(lang="en", processors='tokenize,pos,constituency',
                          package={'constituency': 'wsj_bert'})

def tree_to_text(tree):
    """Yield the text of every ``S`` clause found in a constituency tree.

    The tree is converted to its string form and re-read as an ``nltk.Tree``.
    Each subtree labelled ``S`` contributes its leaves joined by single
    spaces, in the order ``Tree.subtrees()`` produces them. Every clause
    except the last is then truncated at the first textual occurrence of the
    clause that follows it, so a clause that contains the next one yields
    only the words that come before it.

    Args:
        tree: A constituency tree whose ``str()`` is a bracketed parse that
            ``nltk.Tree.fromstring`` can read.

    Yields:
        str: One (possibly truncated, possibly empty) text per ``S`` clause.
    """
    parsed = Tree.fromstring(str(tree))
    clauses = [
        ' '.join(subtree.leaves())
        for subtree in parsed.subtrees()
        if subtree.label() == "S"
    ]
    # ``clauses[1:]`` is a snapshot, so each clause is compared against the
    # untruncated text of its successor.
    for position, following in enumerate(clauses[1:]):
        # ``find`` instead of ``index``: sibling clauses (e.g. two
        # coordinated sentences) do not contain one another, and ``index``
        # would raise ValueError there. Such clauses are kept whole.
        cut = clauses[position].find(following)
        if cut != -1:
            clauses[position] = clauses[position][:cut]
    yield from clauses


def constituency_tree():
    """Yield the constituency parse of each sentence in the uploaded file.

    The upload's bytes are decoded as UTF-8 and handed to the module-level
    ``nlp`` pipeline in a single call; the ``constituency`` attribute of
    every sentence in the resulting document is yielded in document order.
    """
    raw_text = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
    parsed_doc = nlp(raw_text)
    yield from (sentence.constituency for sentence in parsed_doc.sentences)


def sentence_reader():
    """Yield the non-blank lines of the uploaded file, in file order.

    The upload's bytes are decoded as UTF-8 and split into lines; each
    yielded line keeps its trailing newline, if it had one.

    Whitespace-only lines are skipped. ``main()`` pairs these lines
    positionally with parsed sentences, and a blank line has no sentence of
    its own, so yielding it would shift every later pairing by one.

    Yields:
        str: One line of the uploaded text that contains visible characters.
    """
    content = uploaded_file.getvalue().decode("utf-8")
    for line in StringIO(content):
        if line.strip():
            yield line


def main():
    """Render one expander per sentence with its clauses colour-highlighted.

    Parsed trees from ``constituency_tree()`` are paired positionally with
    the lines from ``sentence_reader()``; iteration stops at the shorter of
    the two. For each pair the line is written out, followed by the clause
    texts from ``tree_to_text()`` rendered through ``annotated_text`` with
    one background colour per clause.

    NOTE(review): the positional pairing presumably relies on the input
    holding one sentence per line -- confirm with the expected file format.
    """
    colors = ["#eae4e9", "#fff1e6", "#fde2e4", "#fad2e1", "#e2ece9", "#bee1e6", "#f0efeb", "#dfe7fd", "#cddafd"]
    pairs = zip(constituency_tree(), sentence_reader())
    for key, (tree, text) in enumerate(pairs, start=1):
        with st.expander(f"Sentence{key}", expanded=True):
            # Index the palette modulo its length so that sentences with more
            # clauses than colours are shown in full; zipping clauses with
            # the palette would silently drop the extra clauses.
            annotated_list = [
                (sub_phrase, "", colors[position % len(colors)])
                for position, sub_phrase in enumerate(tree_to_text(tree))
            ]
            st.write(f'{text}')
            annotated_text(*annotated_list)


# Script entry point: render results only when run as the main module and a
# file has actually been uploaded.
if __name__ == "__main__" and uploaded_file is not None:
    main()