Spaces:

AI-Dashboards
/

Topic-Modeling-Clusters-Free-Text

Running

App Files Files Community

Topic-Modeling-Clusters-Free-Text / app.py

awacke1

Create app.py

c4aa562 over 1 year ago

raw

history blame

2.98 kB

	import streamlit as st
	import spacy
	import numpy as np
	from gensim import corpora, models
	from utils import window, get_depths, get_local_maxima, compute_threshold, get_threshold_segments
	from itertools import chain
	from sklearn.preprocessing import MultiLabelBinarizer
	from sklearn.metrics.pairwise import cosine_similarity

	# Load spaCy's small English pipeline once at module level. The page script
	# below calls `nlp(text)` on each input line and reads `doc.sents` and the
	# per-token attributes (lemma_, is_stop, is_punct) from the result.
	# NOTE(review): the 'en_core_web_sm' model package must be installed in the
	# environment; presumably this is handled by the Space's requirements - confirm.
	nlp = spacy.load('en_core_web_sm')

	def print_list(lst) -> None:
	    """Render each element of *lst* as its own Markdown bullet.

	    Args:
	        lst: Iterable of strings. Every element is concatenated onto the
	            "- " bullet prefix, so a non-string element raises TypeError.

	    Returns:
	        None. Output is written to the Streamlit page, one ``st.markdown``
	        call per element, in iteration order.
	    """
	    for item in lst:
	        st.markdown("- " + item)

	# ---------------------------------------------------------------------------
	# Page script: collect text (upload or text area), fit an LDA topic model on
	# its sentences, and print the sentences grouped into topic segments.
	# ---------------------------------------------------------------------------
	st.subheader("Topic Modeling with Segmentation")
	uploaded_file = st.file_uploader("choose a text file", type=["txt"])
	if uploaded_file is not None:
	    # Store the decoded upload under the same session-state key ("text") that
	    # the text area below is bound to, so the upload becomes its content.
	    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

	st.write("OR")

	input_text = st.text_area(
	    label="Enter text separated by newlines",
	    value="",
	    key="text",
	    height=150
	)

	button = st.button('Get Segments')
	if button and input_text != "":
	    # Each input line is parsed separately; sentences keep their input order.
	    sents = [
	        sent
	        for text in input_text.split('\n')
	        for sent in nlp(text).sents
	    ]

	    # Keep lower-cased lemmas of tokens that are not stop words, not
	    # punctuation, not whitespace-only and at least MIN_LENGTH characters long.
	    MIN_LENGTH = 3
	    tokenized_sents = [
	        [
	            token.lemma_.lower()
	            for token in sent
	            if not token.is_stop
	            and not token.is_punct
	            and token.text.strip()
	            and len(token) >= MIN_LENGTH
	        ]
	        for sent in sents
	    ]
	    st.write("Modeling topics:")

	    # Seed NumPy's global RNG before training the model.
	    np.random.seed(123)

	    N_TOPICS = 5
	    N_PASSES = 5

	    dictionary = corpora.Dictionary(tokenized_sents)
	    if len(dictionary) == 0:
	        # Nothing survived the token filter (e.g. whitespace-only input or
	        # only stop words). gensim's LdaModel raises ValueError when given an
	        # empty vocabulary, so report the problem instead of crashing.
	        st.warning(
	            "No usable words were found in the input after removing stop "
	            f"words, punctuation and words shorter than {MIN_LENGTH} characters."
	        )
	    else:
	        bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
	        topic_model = models.LdaModel(
	            corpus=bow,
	            id2word=dictionary,
	            num_topics=N_TOPICS,
	            passes=N_PASSES,
	        )
	        st.write("inferring topics ...")

	        # Per sentence: (topic_id, probability) pairs at or above THRESHOLD.
	        THRESHOLD = 0.05
	        doc_topics = list(
	            topic_model.get_document_topics(bow, minimum_probability=THRESHOLD)
	        )

	        # Per sentence: ids of its k most probable topics, most probable first.
	        k = 3
	        top_k_topics = [
	            [t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k]
	            for sent_topics in doc_topics
	        ]

	        # Group consecutive sentences with the `window` helper from utils and
	        # reduce each group to the set of distinct topic ids it contains.
	        # NOTE(review): the behaviour of `window` and of the depth/threshold
	        # helpers on very short inputs (fewer sentences than WINDOW_SIZE) is
	        # not visible in this file - confirm against utils.
	        WINDOW_SIZE = 3
	        window_topics = window(top_k_topics, n=WINDOW_SIZE)
	        # `win` (not `window`) so the imported helper's name is not shadowed.
	        window_topics = [list(set(chain.from_iterable(win))) for win in window_topics]

	        # One multi-hot row of length N_TOPICS per window.
	        binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))

	        encoded_topic = binarizer.fit_transform(window_topics)
	        st.write("generating segments ...")

	        # Cosine similarity between each window and the one that follows it.
	        sims_topic = [
	            cosine_similarity([left], [right])[0][0]
	            for left, right in zip(encoded_topic, encoded_topic[1:])
	        ]
	        depths_topic = get_depths(sims_topic)
	        filtered_topic = get_local_maxima(depths_topic, order=1)
	        threshold_topic = compute_threshold(filtered_topic)
	        threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)

	        # Relies on get_threshold_segments returning a NumPy array (scalar
	        # addition followed by .tolist()). Presumably the offset maps window
	        # positions back to sentence positions - confirm against utils.
	        segment_ids = threshold_segments_topic + WINDOW_SIZE

	        # Add the start and end of the text as outer boundaries, then slice
	        # the sentence list between each pair of neighbouring boundaries.
	        segment_ids = [0] + segment_ids.tolist() + [len(sents)]
	        segmented = [
	            sents[start:end]
	            for start, end in zip(segment_ids[:-1], segment_ids[1:])
	        ]

	        # Every segment except the last is followed by a horizontal rule.
	        for segment in segmented[:-1]:
	            print_list([s.text for s in segment])
	            st.markdown("""---""")

	        print_list([s.text for s in segmented[-1]])