kandysh's picture
Update app.py
2c51e6b
raw
history blame
No virus
1.78 kB
from io import StringIO
import streamlit as st
from nltk import Tree
import stanza
from annotated_text import annotated_text
# Page configuration is the first Streamlit call issued by this script.
st.set_page_config(layout='wide')
st.title("Clause segmentation")
# Module-level upload widget restricted to "txt" files.  The functions below
# read this global directly, and the entry guard at the bottom of the file
# compares it against None before calling main().
uploaded_file = st.file_uploader("Upload your text file", type="txt")
def tree_to_text(tree):
    """Yield the text of each ``S`` (clause) constituent of a parse tree.

    The tree is re-parsed from its string form with ``Tree.fromstring`` and
    every subtree labelled ``S`` is rendered as its space-joined leaves, in
    the order ``Tree.subtrees()`` produces them.  Each clause is then cut off
    at the point where the text of the following clause begins, so a clause
    that encloses the next one keeps only its leading words.  The last clause
    is never shortened.

    Args:
        tree: Any object whose ``str()`` is a bracketed tree that
            ``Tree.fromstring`` can parse.

    Yields:
        One string per ``S`` subtree.  A string may be empty when the
        following clause starts at the very beginning of the current one.
    """
    parsed = Tree.fromstring(str(tree))
    clauses = [
        ' '.join(subtree.leaves())
        for subtree in parsed.subtrees()
        if subtree.label() == "S"
    ]
    # ``clauses[1:]`` is a snapshot, so every clause is compared against the
    # untrimmed text of its successor.
    for position, following in enumerate(clauses[1:]):
        # The following clause is a substring of this one only when it is
        # nested inside it.  For clauses that are not nested (e.g. siblings)
        # ``str.index`` would raise ValueError and abort the generator, so
        # use ``find`` and leave such clauses whole.
        cut = clauses[position].find(following)
        if cut != -1:
            clauses[position] = clauses[position][:cut]
    yield from clauses
def constituency_tree():
    """Yield the constituency parse of every sentence in the uploaded file.

    Builds an English Stanza pipeline (``tokenize,pos,constituency`` with the
    ``wsj_bert`` constituency package), runs it over the whole UTF-8 decoded
    content of the module-level ``uploaded_file`` and yields the
    ``constituency`` attribute of each sentence in the resulting document.
    """
    pipeline = stanza.Pipeline(
        lang="en",
        processors='tokenize,pos,constituency',
        package={'constituency': 'wsj_bert'},
    )
    raw_text = uploaded_file.getvalue().decode("utf-8")
    parsed_doc = pipeline(raw_text)
    yield from (sentence.constituency for sentence in parsed_doc.sentences)
def sentence_reader():
    """Yield the uploaded file's text one line at a time.

    The content of the module-level ``uploaded_file`` is decoded as UTF-8,
    wrapped in a ``StringIO`` and split with ``readlines()``, so each yielded
    line keeps its trailing newline when it has one.
    """
    buffer = StringIO(uploaded_file.getvalue().decode("utf-8"))
    yield from buffer.readlines()
def main():
    """Render one expander per sentence showing its text and its clauses.

    Pairs each constituency tree from ``constituency_tree()`` with a line
    from ``sentence_reader()``.  For every pair an expander titled
    ``Sentence<N>`` (N counted from 1) is opened; inside it the line is
    written, followed by the clauses from ``tree_to_text`` passed to
    ``annotated_text`` as ``(clause, color)`` tuples.
    """
    # Local import: only this function needs it.
    from itertools import cycle

    colors = ["#a2d2ff", "#bde0fe", "#ffafcc", "#cdb4db", "#fefae0"]
    # NOTE(review): zip pairs the n-th parsed sentence with the n-th line of
    # the file and stops at the shorter of the two; this presumably expects
    # one sentence per line -- confirm against the intended input format.
    pairs = zip(constituency_tree(), sentence_reader())
    for key, (tree, text) in enumerate(pairs, start=1):
        with st.expander(f"Sentence{key}", expanded=True):
            # Cycle through the palette so sentences with more clauses than
            # colors keep every clause; a plain zip against the five-entry
            # list would silently drop the sixth clause onwards.
            # NOTE(review): the hex string is the second tuple element;
            # check in the annotated_text docs whether that position is a
            # label or a background color -- TODO confirm.
            annotated_list = list(zip(tree_to_text(tree), cycle(colors)))
            st.write(f'{text} \n')
            annotated_text(*annotated_list)
            st.write("\n")
# Run the app only when executed as a script and a file has been uploaded.
if __name__ == "__main__" and uploaded_file is not None:
    main()