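"""Streamlit app: topic-based text segmentation.

Each sentence is assigned LDA topics, the topic ids are pooled over a
sliding window, and consecutive windows are compared with cosine
similarity; TextTiling-style depth scoring over the similarity curve then
picks the segment boundaries.
"""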
import streamlit as st
import spacy
import numpy as np
from gensim import corpora, models
from itertools import chain, islice
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelmax

# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')


def window(seq, n=3):
    """Yield a sliding window of width n over seq, as consecutive tuples."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
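# Sanity check of the sliding window (illustrative values):
#   list(window([1, 2, 3, 4], n=3)) == [(1, 2, 3), (2, 3, 4)]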
        
def get_depths(scores):
    """Compute a TextTiling-style depth score for every gap score.

    The depth at position i is the average drop from the nearest peaks on
    the left and right: 0.5 * ((l_peak - score) + (r_peak - score)).
    """

    def climb(seq, i, mode='left'):
        # Walk away from position i while the values keep rising and
        # return the value at the nearest peak in that direction.
        if mode == 'left':
            while True:
                curr = seq[i]
                if i == 0:
                    return curr
                i = i - 1
                if not seq[i] > curr:
                    return curr

        if mode == 'right':
            while True:
                curr = seq[i]
                if i == (len(seq) - 1):
                    return curr
                i = i + 1
                if not seq[i] > curr:
                    return curr

    depths = []
    for i in range(len(scores)):
        score = scores[i]
        l_peak = climb(scores, i, mode='left')
        r_peak = climb(scores, i, mode='right')
        depth = 0.5 * (l_peak + r_peak - (2 * score))
        depths.append(depth)

    return np.array(depths)
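
# Worked example with illustrative numbers: for scores [0.9, 0.2, 0.8] the
# middle gap sits in a valley, so get_depths returns array([0., 0.65, 0.]):
# 0.5 * ((0.9 - 0.2) + (0.8 - 0.2)) = 0.65.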


def get_local_maxima(depth_scores, order=1):
    """Keep only the local maxima of the depth scores; zero out the rest."""
    maxima_ids = argrelmax(depth_scores, order=order)[0]
    filtered_scores = np.zeros(len(depth_scores))
    filtered_scores[maxima_ids] = depth_scores[maxima_ids]
    return filtered_scores
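
# E.g. get_local_maxima(np.array([0.1, 0.65, 0.3, 0.4, 0.2])) keeps only the
# strict local peaks and returns array([0., 0.65, 0., 0.4, 0.]).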

def compute_threshold(scores):
    """TextTiling cutoff: mean of the non-zero scores minus half their std."""
    s = scores[np.nonzero(scores)]
    threshold = np.mean(s) - (np.std(s) / 2)
    return threshold

def get_threshold_segments(scores, threshold=0.1):
    """Return the gap indices whose score clears the threshold (boundaries)."""
    segment_ids = np.where(scores >= threshold)[0]
    return segment_ids
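
# Continuing the example above: the non-zero maxima are [0.65, 0.4], giving a
# threshold of 0.525 - 0.125 / 2 = 0.4625, so only the gap at index 1 is kept.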


def print_list(lst):
    """Render a list of strings as Markdown bullets."""
    for e in lst:
        st.markdown("- " + e)


st.subheader("Topic Modeling with Segmentation")
uploaded_file = st.file_uploader("Choose a text file", type=["txt"])
if uploaded_file is not None:
    # Pre-fill the text area below through its session-state key.
    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    key="text",
    height=150
)

button = st.button('Get Segments')
if button and input_text != "":
    # Split each input line into sentences with spaCy.
    texts = input_text.split('\n')
    sents = []
    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sents.append(sent)

    # Lemmatize and lowercase; drop stopwords, punctuation, whitespace-only
    # tokens, and tokens shorter than MIN_LENGTH characters.
    MIN_LENGTH = 3
    tokenized_sents = [[token.lemma_.lower() for token in sent
                        if not token.is_stop and not token.is_punct
                        and token.text.strip() and len(token) >= MIN_LENGTH]
                       for sent in sents]
    st.write("Modeling topics:")


    # Fix the global NumPy seed so the LDA results are reproducible.
    np.random.seed(123)

    N_TOPICS = 5
    N_PASSES = 5

    # Train an LDA topic model over the bag-of-words sentences.
    dictionary = corpora.Dictionary(tokenized_sents)
    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
    topic_model = models.LdaModel(corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES)

    st.write("inferring topics ...")

    # Keep only topic assignments with probability above THRESHOLD.
    THRESHOLD = 0.05
    doc_topics = list(topic_model.get_document_topics(bow, minimum_probability=THRESHOLD))
    # For each sentence, keep the ids of its k most probable topics.
    k = 3
    top_k_topics = [[t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k]
                    for sent_topics in doc_topics]

    # Pool the topic ids over a sliding window of sentences, then one-hot
    # encode each window so neighbouring windows can be compared.
    WINDOW_SIZE = 3
    window_topics = window(top_k_topics, n=WINDOW_SIZE)
    window_topics = [list(set(chain.from_iterable(w))) for w in window_topics]

    binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))
    encoded_topic = binarizer.fit_transform(window_topics)
    st.write("generating segments ...")
    sims_topic = [cosine_similarity([pair[0]], [pair[1]])[0][0] for pair in zip(encoded_topic, encoded_topic[1:])]
    depths_topic = get_depths(sims_topic)
    filtered_topic = get_local_maxima(depths_topic, order=1)
    threshold_topic = compute_threshold(filtered_topic)
    threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)

    segment_ids = threshold_segments_topic + WINDOW_SIZE

    segment_ids = [0] + segment_ids.tolist() + [len(sents)]
    slices = list(zip(segment_ids[:-1], segment_ids[1:]))

    segmented = [sents[s[0]: s[1]] for s in slices]

    # Render each segment as a bullet list, separated by horizontal rules.
    for segment in segmented[:-1]:
        print_list([s.text for s in segment])
        st.markdown("""---""")

    print_list([s.text for s in segmented[-1]])
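
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py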