awacke1 committed on
Commit c4aa562
1 Parent(s): bdb3498

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
import streamlit as st
import spacy
import numpy as np
from gensim import corpora, models
from utils import window, get_depths, get_local_maxima, compute_threshold, get_threshold_segments
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load('en_core_web_sm')

def print_list(lst):
    for e in lst:
        st.markdown("- " + e)

st.subheader("Topic Modeling with Segmentation")
uploaded_file = st.file_uploader("Choose a text file", type=["txt"])
if uploaded_file is not None:
    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150
)

button = st.button('Get Segments')
if button and input_text != "":
    # Split the input into sentences with spaCy.
    texts = input_text.split('\n')
    sents = []
    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sents.append(sent)

    # Tokenize each sentence into lowercased lemmas, dropping stopwords,
    # punctuation, whitespace-only tokens, and very short tokens.
    MIN_LENGTH = 3
    tokenized_sents = [[token.lemma_.lower() for token in sent
                        if not token.is_stop and not token.is_punct
                        and token.text.strip() and len(token) >= MIN_LENGTH]
                       for sent in sents]

    st.write("Modeling topics:")
    np.random.seed(123)

    # Fit an LDA topic model over the per-sentence bag-of-words corpus.
    N_TOPICS = 5
    N_PASSES = 5
    dictionary = corpora.Dictionary(tokenized_sents)
    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
    topic_model = models.LdaModel(corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES)

    st.write("Inferring topics ...")
    # Keep each sentence's top-k topics above a minimum probability.
    THRESHOLD = 0.05
    doc_topics = list(topic_model.get_document_topics(bow, minimum_probability=THRESHOLD))
    k = 3
    top_k_topics = [[t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k]
                    for sent_topics in doc_topics]

    # Slide a window over the sentences and pool the topic ids seen in each window.
    WINDOW_SIZE = 3
    window_topics = window(top_k_topics, n=WINDOW_SIZE)
    window_topics = [list(set(chain.from_iterable(w))) for w in window_topics]

    # One-hot encode each window's topic set so adjacent windows can be compared.
    binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))
    encoded_topic = binarizer.fit_transform(window_topics)

    st.write("Generating segments ...")
    # Cosine similarity between adjacent windows; a deep dip marks a topic shift.
    sims_topic = [cosine_similarity([pair[0]], [pair[1]])[0][0]
                  for pair in zip(encoded_topic, encoded_topic[1:])]
    depths_topic = get_depths(sims_topic)
    filtered_topic = get_local_maxima(depths_topic, order=1)
    threshold_topic = compute_threshold(filtered_topic)
    threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)

    # Map window-gap indices back to sentence indices and slice out the segments.
    segment_ids = threshold_segments_topic + WINDOW_SIZE
    segment_ids = [0] + segment_ids.tolist() + [len(sents)]
    slices = list(zip(segment_ids[:-1], segment_ids[1:]))
    segmented = [sents[s[0]:s[1]] for s in slices]

    # Render each segment as a bullet list, separated by horizontal rules.
    for segment in segmented[:-1]:
        print_list([s.text for s in segment])
        st.markdown("---")

    print_list([s.text for s in segmented[-1]])
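
Note: app.py imports five helpers from a local utils module that is not part of this commit. Below is a minimal sketch of what they might look like, assuming a TextTiling-style depth-scoring scheme. Only the names and call signatures are taken from the imports above; the function bodies, the climb helper, and the mean-minus-one-std threshold are illustrative assumptions, not the Space's actual code.

# Hypothetical utils.py sketch (assumed, not from this commit).
from itertools import islice

import numpy as np
from scipy.signal import argrelmax


def window(seq, n=3):
    """Yield a sliding window of width n over seq."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def climb(scores, i, mode='left'):
    """Walk uphill from gap i and return the nearest peak height. (Assumed helper.)"""
    if mode == 'left':
        while i > 0 and scores[i - 1] > scores[i]:
            i -= 1
    else:
        while i < len(scores) - 1 and scores[i + 1] > scores[i]:
            i += 1
    return scores[i]


def get_depths(scores):
    """Depth of the valley at each gap: (left peak - score) + (right peak - score)."""
    return np.array([(climb(scores, i, 'left') - s) + (climb(scores, i, 'right') - s)
                     for i, s in enumerate(scores)])


def get_local_maxima(depths, order=1):
    """Zero out every depth score that is not a local maximum."""
    maxima_ids = argrelmax(depths, order=order)[0]
    filtered = np.zeros(len(depths))
    filtered[maxima_ids] = depths[maxima_ids]
    return filtered


def compute_threshold(depths):
    """Cutoff over the surviving depths; mean minus one std is an assumed choice."""
    nonzero = depths[depths > 0]
    return nonzero.mean() - nonzero.std()


def get_threshold_segments(depths, threshold=0.1):
    """Indices whose depth exceeds the threshold; app.py treats these as boundaries."""
    return np.where(depths > threshold)[0]

With a utils.py along these lines next to app.py and the spaCy model installed (python -m spacy download en_core_web_sm), the app can be launched with streamlit run app.py.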