from io import StringIO
from itertools import cycle

import streamlit as st
from nltk import Tree
import stanza
from annotated_text import annotated_text
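# Clause segmentation demo: upload a plain-text file, parse each sentence with
# Stanza's BERT constituency parser, and highlight the clause-level ("S") spans
# with annotated_text. Run locally with: streamlit run app.py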
st.set_page_config(layout='wide')
st.title("Clause segmentation")
st.info("This BERT Stanza parser is quite slow; please check out the Berkeley parser instead.")
uploaded_file = st.file_uploader("Upload your text file", type="txt")

nlp = stanza.Pipeline(lang="en", processors='tokenize,pos,constituency',
                      package={'constituency': 'wsj_bert'})
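# Note: the English models (including the 'wsj_bert' constituency package) must be
# downloaded once before the pipeline can be built, e.g. with stanza.download("en").
# Building the pipeline at module level also means it is reloaded on every Streamlit
# rerun; on recent Streamlit versions it could be wrapped in a cached loader instead
# (a sketch, assuming @st.cache_resource is available):
#
#   @st.cache_resource
#   def load_pipeline():
#       return stanza.Pipeline(lang="en", processors="tokenize,pos,constituency",
#                              package={"constituency": "wsj_bert"})
#
#   nlp = load_pipeline()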
def tree_to_text(tree):
    """Yield the clause-level ("S") spans of a constituency tree, trimming overlaps."""
    t = Tree.fromstring(str(tree))
    subtexts = []
    for subtree in t.subtrees():
        if subtree.label() == "S":
            subtexts.append(' '.join(subtree.leaves()))
    # Each "S" span contains the spans of its embedded clauses; cut every span
    # just before the next one so the clauses do not repeat in the output.
    for index in range(len(subtexts) - 1):
        subtexts[index] = subtexts[index][0:subtexts[index].index(subtexts[index + 1])]
    for text in subtexts:
        yield text
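# Illustrative example (output depends on the parse): for "I left because it rained .",
# the "S" subtrees yield the full sentence plus the embedded clause "it rained",
# which tree_to_text trims to the non-overlapping spans "I left because " and "it rained".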
def constituency_tree():
    """Parse the uploaded file with Stanza and yield one constituency tree per sentence."""
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    data = stringio.read()
    doc = nlp(data)
    for sent in doc.sentences:
        yield sent.constituency
def sentence_reader():
    """Yield the uploaded file line by line (one sentence per line is assumed)."""
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    for data in stringio.readlines():
        yield data
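# Note: main() zips the Stanza-segmented sentences against the raw lines of the
# file, so the clause highlights only line up with the displayed text when the
# upload really does contain one sentence per line.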
def main():
    # colors = ["#9CFAFF", "#B1FBDF", "#C5FDBE", "#DAFE9E", "#EEFF7D"]
    colors = ["#eae4e9", "#fff1e6", "#fde2e4", "#fad2e1", "#e2ece9", "#bee1e6", "#f0efeb", "#dfe7fd", "#cddafd"]
    key = 1
    for tree, text in zip(constituency_tree(), sentence_reader()):
        with st.expander(f"Sentence {key}", expanded=True):
            annotated_list = []
            sub_phrases = tree_to_text(tree)
            # cycle() keeps colouring clauses even when a sentence has more than
            # len(colors) clause spans (a plain zip would silently drop the extras).
            for sub_phrase, color in zip(sub_phrases, cycle(colors)):
                annotated_list.append((sub_phrase, "", color))
            key += 1
            st.write(f'{text}')
            annotated_text(*annotated_list)


if __name__ == "__main__":
    if uploaded_file is not None:
        main()