Spaces:
Runtime error
Runtime error
First Commit
Browse files- .gitignore +1 -0
- CosmosData/Data.pkl +3 -0
- CosmosData/SciData.pkl +3 -0
- CosmosData/SciTechData.pkl +3 -0
- CosmosData/TechData.pkl +3 -0
- CosmosData/bow_corpus.pkl +3 -0
- CosmosData/dictionary.pkl +3 -0
- CosmosData/preprocessed_scitech.pkl +3 -0
- CosmosData/test_data.pkl +3 -0
- CosmosData/train_data.pkl +3 -0
- app.py +477 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Viz
|
CosmosData/Data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10d412f2c213194de8fad35f72b2b89a2296791586efc2896cc37ae24d8cfee0
|
3 |
+
size 6097905
|
CosmosData/SciData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8000870576a832b5687cbc9f79c6bf58d0fc812b86732627753369eacd7c39cd
|
3 |
+
size 12995836
|
CosmosData/SciTechData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11ac166393d5c6c9a6fc0d5300402df108c73853ebaa3c00654703bc0d485fad
|
3 |
+
size 18207515
|
CosmosData/TechData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a71736ae1115b557e58472c0e2cf0726ad0b02ddd22daeb30c49c4dca9b7220
|
3 |
+
size 6101253
|
CosmosData/bow_corpus.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:236080fb44c7b688b1939b8463029b082ea37c27c872fe39a3c378395eafea49
|
3 |
+
size 5583422
|
CosmosData/dictionary.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:365560d76985137c95f2007dc40e7f6fef511693ef26b7b1ad1393a9b539143c
|
3 |
+
size 290617
|
CosmosData/preprocessed_scitech.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2dd754280f5543895399e85b3c78aa295a73c8dda23dadbc6db85fb155efa09
|
3 |
+
size 13832946
|
CosmosData/test_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bdbc5902b11327dbb9edd8c5c254d5c7aa35e0a6e8eca4f2f67cf5b659169039
|
3 |
+
size 2874718
|
CosmosData/train_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53b3018b3d5e26d7e3ca900adf8ade7a05dfe6f143e7b1bc024a642311ad49ba
|
3 |
+
size 11024839
|
app.py
ADDED
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
import math
|
6 |
+
import gensim
|
7 |
+
import pickle
|
8 |
+
import pyLDAvis
|
9 |
+
import pyLDAvis.gensim_models as gensimvis
|
10 |
+
import plotly.express as px
|
11 |
+
import plotly.graph_objects as go
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import matplotlib.colors as mcolors
|
14 |
+
from bokeh.plotting import figure, output_file, show
|
15 |
+
from bokeh.models import Label
|
16 |
+
from bokeh.io import output_notebook
|
17 |
+
from plotly.subplots import make_subplots
|
18 |
+
from pandasgui import show
|
19 |
+
from sklearn.manifold import TSNE
|
20 |
+
from sklearn.model_selection import train_test_split
|
21 |
+
from gensim.parsing.preprocessing import STOPWORDS
|
22 |
+
from wordcloud import WordCloud
|
23 |
+
|
24 |
+
# Palette used to colour each topic's highlight words in the per-topic
# word clouds (indexed by topic number via color_func below).
colors = ['peachpuff','lightskyblue','turquoise','darkorange','purple','olive','lightgreen','darkseagreen','maroon','teal',
          'deepskyblue','red','mediumblue','indigo','goldenrod','mediumvioletred','pink','beige','rosybrown']

# Must be the first Streamlit command executed on the page.
st.set_page_config(layout="wide")

st.markdown("<h1 style='font-weight: normal'><b>Topic Model</b>: Science and Technology News</h1>", unsafe_allow_html=True)
|
30 |
+
|
31 |
+
def load_mpmt(site):
    """Load the passes- and topics-coherence sweeps for *site*.

    Reads the two pickled sweep results produced during training and returns
    each as a DataFrame: one row per candidate model (first 50 kept), with
    the 'coherence' column coerced to float.

    Args:
        site: short site prefix used in the model directory names
            (e.g. 'Cosmos').

    Returns:
        (mp_df, mt_df): the passes sweep and the topics sweep DataFrames.
    """
    def _load_sweep(kind):
        # Both pickles share the same shape (model-name -> metrics dict);
        # factor the duplicated load/transpose/truncate/coerce sequence.
        with open(f'./Models/{site}Models/{site.lower()}_lda_{kind}_train.pickle', 'rb') as file:
            sweep = pickle.load(file)

        df = pd.DataFrame(sweep)
        df = df.transpose()
        df = df.iloc[0:50]
        df['coherence'] = df['coherence'].astype(float)
        return df

    return _load_sweep('passes'), _load_sweep('topics')
|
49 |
+
|
50 |
+
def load_ex(site):
    """Load the extreme-word-filtering sweep and select its best model.

    Args:
        site: short site prefix used in the model directory names.

    Returns:
        (ex_df, best_model, bow_corpus, dictionary): the full sweep
        DataFrame plus the model, corpus and dictionary from the row with
        the highest coherence.
    """
    with open(f'./Models/{site}Models/{site.lower()}_extreme2.pickle', 'rb') as file:
        model_extreme = pickle.load(file)

    ex_df = pd.DataFrame(model_extreme)
    ex_df = ex_df.transpose()
    ex_df['coherence'] = ex_df['coherence'].astype(float)
    ex_df = ex_df.reset_index()

    # Locate the best-coherence row once instead of recomputing idxmax
    # (and the positional lookup) three separate times.
    best_row = ex_df.iloc[ex_df['coherence'].idxmax()]
    best_model = best_row['model']
    bow_corpus = best_row['corpus']
    dictionary = best_row['dictionary']

    return ex_df, best_model, bow_corpus, dictionary
|
64 |
+
|
65 |
+
def load_model(site):
    """Return the preprocessed (tokenised) document series for *site*.

    Note: despite the name, this loads preprocessed text, not an LDA model.
    """
    path = f'./{site}Data/preprocessed_scitech.pkl'
    with open(path, 'rb') as fh:
        series = pickle.load(fh)
    return series
|
70 |
+
|
71 |
+
def load_related(site, bow_corpus, highest_top, model=None):
    """Return up to 10 article URLs whose dominant topic is *highest_top*.

    Args:
        site: short data-directory prefix (e.g. 'Cosmos').
        bow_corpus: bag-of-words corpus aligned row-for-row with the
            pickled news DataFrame.
        highest_top: topic id to filter on.
        model: LDA model used to score documents. Defaults to the
            module-level ``best_model`` for backward compatibility — the
            original implementation read that global implicitly.

    Returns:
        The first 10 'url' values of documents dominated by *highest_top*.
    """
    if model is None:
        # NOTE(review): ``best_model`` is only bound at script level after
        # load_ex() has run — confirm call order before reusing elsewhere.
        model = best_model

    with open(f"./{site}Data/SciTechData.pkl", "rb") as file:
        news = pickle.load(file)

    dm_topic = []
    for corp in bow_corpus:
        topic_percs = model.get_document_topics(corp)
        # Dominant topic = the highest-probability topic for this document.
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)

    news['dominant_topic'] = dm_topic

    return news[news['dominant_topic'] == highest_top]['url'][:10]
|
85 |
+
|
86 |
+
def load_evaluation_graph(data, xlabel, ylabel, title):
    """Build a coherence line chart and mark its best point.

    For long sweeps (>25 rows, i.e. the passes/topics sweeps) the x axis is
    the 1-based sweep position; otherwise (the extreme-filtering sweep) the
    x axis is the filter percentage 30..90.

    Args:
        data: sweep DataFrame with a float 'coherence' column.
        xlabel, ylabel, title: chart labels.

    Returns:
        (fig, vert_value): the plotly figure and the x position of the
        maximum coherence, also drawn as a vertical line.
    """
    if (len(data) > 25):
        fig = px.line(data, x=range(1, len(data)+1), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        fig.add_hline(y=data['coherence'].max())
        # Index labels embed the sweep number after an 'a' (or 's' in the
        # other naming scheme); pull the trailing number out. Narrowed from
        # a bare ``except:`` so unrelated errors are no longer swallowed.
        try:
            vert_value = int(data['coherence'].idxmax().split('a')[1])
        except (IndexError, ValueError):
            vert_value = int(data['coherence'].idxmax().split('s')[1])
    else:
        fig = px.line(data[::-1], x=range(30, 100, 10), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        vert_value = int(data.reset_index()['coherence'].idxmax())
        fig.update_xaxes(range=[30, 90])

    fig.add_vline(x=vert_value)

    return fig, vert_value
|
102 |
+
|
103 |
+
def load_cloud(processed_series):
    """Render one word cloud over every document in *processed_series*.

    Args:
        processed_series: iterable of token lists (one list per document).

    Returns:
        A plotly figure displaying the word-cloud image.
    """
    stopwords = set(STOPWORDS)

    # Join all documents into one space-separated string. A single join
    # replaces the original quadratic ``+=`` concatenation loop; the
    # trailing space matches the original output exactly.
    all_words = ' '.join(' '.join(val) for val in processed_series) + ' '

    wordcloud = WordCloud(width=1800, height=1600,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(all_words)

    fig = px.imshow(wordcloud)

    return fig
|
124 |
+
|
125 |
+
def load_cloud_each(model, site):
    """Render one small word cloud per topic, five per row of Streamlit columns.

    Each cloud highlights the topic's top-3 words in the topic's palette
    colour (via the module-level ``color_func``); every other word is grey.

    Args:
        model: trained gensim LDA model.
        site: display name of the site; selects the site-specific junk
            tokens added to the stopword list.
    """
    # Site-specific artefact tokens to suppress in the clouds.
    # NOTE(review): ``words`` is unbound for any other site value — confirm
    # the selectbox guarantees one of these three names.
    if site == 'Popular Science' or site == 'Cosmos Magazine':
        words = ['u']
    elif site == 'Discover Magazine':
        words = ['nt', 'u', 've', 'm', 'll', 'd', 'rofl']

    stopwords = set(STOPWORDS)

    for i in words:
        stopwords.add(i)

    num_topics = len(model.get_topics())

    # Flat list of (topic_id, word) pairs: the top-3 words of every topic.
    topic_top3words = [(i, topic) for i, topics in model.show_topics(formatted=False, num_topics=num_topics) for j, (topic, wt) in enumerate(topics) if j < 3]

    # Regroup the flat pair list into one 3-word list per topic;
    # new_new_list[n] ends up holding topic n's highlight words.
    k=0
    new_list = []
    new_new_list = []

    j = 0
    while (j < len(topic_top3words)):
        i = topic_top3words[j][1]

        # At the last element the current group is appended first; the final
        # word still lands in it because the appended list is the SAME object
        # mutated by new_list.append(i) below (deliberate aliasing).
        if(j == len(topic_top3words)-1):
            new_new_list.append(new_list)

        if(k<3):
            j += 1
        else:
            # Group of 3 complete: store it and start a fresh group without
            # consuming the current word (hence the continue).
            new_new_list.append(new_list)
            new_list = []
            k = 0
            continue
        new_list.append(i)
        k += 1

    # The lambda captures ``n`` late (by reference): each
    # generate_from_frequencies() call below therefore colours with the
    # topic index current at draw time.
    cloud = WordCloud(stopwords=stopwords,
                      background_color='white',
                      width=750,
                      height=750,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: color_func(*args, **kwargs, n=n, topics=new_new_list[n]),
                      prefer_horizontal=1.0)

    topics = model.show_topics(num_topics=num_topics, formatted=False)

    j = 0  # column slot within the current row (0..4)
    n = 0  # topic index
    col1, col2, col3, col4, col5 = st.columns(5)

    while n < num_topics:
        if (j < 5):
            if (j == 0):
                col = col1
            elif (j == 1):
                col = col2
            elif (j == 2):
                col = col3
            elif (j == 3):
                col = col4
            elif (j == 4):
                col = col5
        else:
            # Row full: start a fresh row of five columns and retry this topic.
            j = 0
            col1, col2, col3, col4, col5 = st.columns(5)
            continue

        with col:
            fig = plt.figure(figsize=(1.5,1.5))
            plt.title('Topic ' + str(n+1), fontdict=dict(size=6))
            plt.axis('off')
            topic_words = dict(topics[n][1])
            cloud.generate_from_frequencies(topic_words, max_font_size=400)
            plt.imshow(cloud)
            st.write(fig)

        j += 1
        n += 1
|
204 |
+
|
205 |
+
def load_LDAvis(model, corpus, dictionary):
    """Build the pyLDAvis topic visualisation and return it as embeddable HTML."""
    prepared = gensimvis.prepare(model, corpus, dictionary)
    return pyLDAvis.prepared_data_to_html(prepared)
|
210 |
+
|
211 |
+
def load_topic_document_count(best_model, bow_corpus):
    """Histogram of document counts per dominant topic.

    Args:
        best_model: trained gensim LDA model.
        bow_corpus: bag-of-words corpus to score.

    Returns:
        (fig, top3_words, top_topic_id): the plotly histogram, the
        comma-joined top-3 words of the most populous dominant topic, and
        that topic's id.
    """
    dm_topic = []

    # Dominant topic = highest-probability topic for each document.
    for i, corp in enumerate(bow_corpus):
        topic_percs = best_model.get_document_topics(corp)
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)

    dm_df = pd.DataFrame(dm_topic, columns=['dominant_topic'])

    # Flat (topic_id, word) pairs for the top 3 words of every topic.
    topic_top3words = [(i, topic) for i, topics in best_model.show_topics(formatted=False, num_topics=-1) for j, (topic, wt) in enumerate(topics) if j < 3]

    # Collapse to one comma-joined string of 3 words per topic id.
    df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
    df_top3words = df_top3words_stacked.groupby('topic_id').agg(', '.join)
    df_top3words.reset_index(level=0,inplace=True)

    # Document count per dominant topic, annotated with the topic's words.
    # NOTE(review): assumes every topic is dominant for at least one
    # document, so the groupby rows align 1:1 with df_top3words — confirm.
    count_df = pd.DataFrame(dm_df.groupby('dominant_topic').dominant_topic.agg('count').to_frame('COUNT').reset_index()['COUNT'])
    count_df['top3'] = list(df_top3words['words'])

    fig = px.histogram(dm_df,
                       x='dominant_topic',
                       labels={'dominant_topic': 'Dominant topic', 'count': 'Number of Documents'},
                       height=500,
                       width=1400,
                       title='Documents Count by Dominant Topic')
    fig.update_layout(yaxis_title='Number of Documents', bargap=0.2)
    fig.update_layout(
        margin=dict(b=40),
        xaxis = dict(
            tickmode = 'array',
            tickvals = list(range(dm_df['dominant_topic'].max()+1)),
            # Label each tick with the topic's top-3 words instead of its id.
            ticktext = df_top3words['words']
        )
    )

    return fig, count_df[count_df['COUNT'] == count_df['COUNT'].max()]['top3'].values[0], count_df['COUNT'].idxmax()
|
247 |
+
|
248 |
+
def load_document_count(data):
    """Histogram of per-document word counts with summary statistics.

    Args:
        data: iterable of token lists (one list per document).

    Returns:
        (fig, fifth, ninefifth): the plotly figure plus the rounded 5th and
        95th percentile document lengths.
    """
    doc_len = [len(d) for d in data]

    fifth = round(np.quantile(doc_len, q=0.05))
    ninefifth = round(np.quantile(doc_len, q=0.95))

    # Annotation text; reuses the percentiles computed above instead of
    # recomputing np.quantile twice as the original did.
    text = "Mean : " + str(round(np.mean(doc_len))) \
        + "<br>Median : " + str(round(np.median(doc_len))) \
        + "<br>Std dev. : " + str(round(np.std(doc_len))) \
        + "<br>5th percentile : " + str(fifth) \
        + "<br>95th percentile : " + str(ninefifth)

    fig = px.histogram(doc_len, labels={"value": "Document Word Count"}, height=500, width=1400, title='Distribution of Documents Word Count')
    fig.add_annotation(x=0.95, xref='paper', y=0.95, yref='paper', text=text, showarrow=False, bgcolor="#F4F4F4", opacity=0.8, borderpad=8, borderwidth=2, bordercolor="#DDDDDD", align='left')
    fig.update_layout(yaxis_title='Number of Documents', showlegend=False)

    return fig, fifth, ninefifth
|
265 |
+
|
266 |
+
def color_func(word, font_size, position, orientation, font_path, random_state, n, topics):
    """WordCloud colour callback: topic palette colour when *word* is one of
    the topic's highlight *topics* words, light grey otherwise."""
    return colors[n] if word in topics else 'lightgrey'
|
271 |
+
|
272 |
+
def load_topic_word_prob(best_model):
    """Bar-chart grid of the top word probabilities of every topic.

    Parses the model's ``show_topics`` string output (fragments like
    '0.025*"word"') back into numbers and words, then plots one bar subplot
    per topic, five per row.

    Args:
        best_model: trained gensim LDA model.

    Returns:
        A plotly figure with one bar trace per topic.
    """
    # One entry per topic: the formatted probability string split on commas.
    topic_prob_list = [i[1].split(',') for i in best_model.show_topics(num_topics=-1)]

    prob_list = []
    words_list = []

    for i in topic_prob_list:
        # NOTE(review): ``*i`` unpacks the split list into re.findall's
        # arguments — this only works because the topic string contains no
        # commas, so the list has exactly one element; confirm.
        num_list = re.findall(r'[\d]*[.][\d]+', *i)
        conv = [float(j) for j in num_list]
        prob_list.append(conv)

        # The words are the double-quoted substrings.
        words = re.findall(r'"(.*?)"', *i)
        words_list.append(words)

    def flatten(l):
        # One-level flatten: list of lists -> flat list.
        return [item for sublist in l for item in sublist]

    words_list = flatten(words_list)
    # show_topics emits 10 words per topic, so repeat each topic id 10x.
    topnum_list = sorted(list(range(best_model.num_topics)) * 10)
    prob_list = flatten(prob_list)

    data = {
        "topic": topnum_list,
        "words": words_list,
        "probability": prob_list
    }

    topic_prob = pd.DataFrame(data)
    new_df = topic_prob.set_index(['topic'])

    # Grid dimensions: five subplots per row.
    rows = math.ceil(best_model.num_topics / 5)

    fig = make_subplots(
        rows=rows,
        cols=5,
        shared_yaxes=True,
        subplot_titles=[f'Topic {n}' for n in range(1, best_model.num_topics+1)]
    )

    j = 1
    n = 0

    # Fill the grid row-major; trailing cells past the last topic stay empty.
    for i in range(1, rows+1):
        for j in range(1, 6):
            if (n < best_model.num_topics):
                fig.add_trace(
                    go.Bar(x=new_df.loc[n]['words'], y=new_df.loc[n]['probability']),
                    row=i, col=j
                )

            n += 1

    fig.update_layout(height=1000, width=1400, title_text="Topic Word Probabilities", showlegend=False, margin=dict(b=5))

    return fig
|
327 |
+
|
328 |
+
def load_tSNE(best_model, bow_corpus):
    """t-SNE scatter plot of document-topic weight vectors.

    Documents whose strongest topic weight exceeds 0.35 are projected to
    2-D with t-SNE and coloured by dominant topic.

    Args:
        best_model: trained gensim LDA model.
        bow_corpus: bag-of-words corpus to score.

    Returns:
        A bokeh figure containing the scatter plot.
    """
    # Topic weight vector for every document.
    topic_weights = []
    for i, row_list in enumerate(best_model[bow_corpus]):
        topic_weights.append([w for i, w in row_list])

    # Dense array of topic weights (absent topics filled with 0).
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # Keep only well-separated points (dominant weight > 0.35).
    arr = arr[np.amax(arr, axis=1) > 0.35]

    # Dominant topic number for each remaining document.
    topic_num = np.argmax(arr, axis=1)

    # t-SNE dimension reduction to 2 components.
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    # Plot the topic clusters using bokeh. The palette is local (named
    # differently to avoid shadowing the module-level ``colors``), and the
    # title now reports the model's real topic count instead of the
    # previously hard-coded 4.
    palette = ['peachpuff','lightskyblue','turquoise','darkorange','purple','olive','lightgreen','darkseagreen','maroon','teal',
               'deepskyblue','red','mediumblue','indigo','goldenrod','mediumvioletred','pink','beige','rosybrown']
    n_topics = best_model.num_topics
    mycolors = np.array(palette)
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
                  plot_width=900, plot_height=700)
    plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])

    return plot
|
357 |
+
|
358 |
+
# --- Page controls -------------------------------------------------------
# Site picker; its value drives everything rendered below.
site = st.selectbox(
    'Select which site to analyze topics',
    ('Popular Science', 'Discover Magazine', 'Cosmos Magazine'),
)

# Reusable vertical spacer snippet.
vert_space = '<div style="padding: 20px 5px;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
|
365 |
+
|
366 |
+
if site:
    # Map the display name to the short prefix used in file/model paths.
    # (The script repeatedly flips ``site`` between display name and prefix
    # below, depending on whether it is shown to the user or used in paths.)
    if site == 'Popular Science':
        site = 'PopSci'
    elif site == 'Discover Magazine':
        site = 'Discover'
    elif site == 'Cosmos Magazine':
        site = 'Cosmos'

    mp_df, mt_df = load_mpmt(site)

    st.subheader("How good is the model?")

    # Coherence-vs-passes and coherence-vs-topics evaluation charts.
    passes_graph, passes_vert = load_evaluation_graph(mp_df, 'Number of Passes', 'Topic Coherence', 'Topic Coherence vs Number of Passes' )
    passes_graph.update_layout(width=650)

    topics_graph, topics_vert = load_evaluation_graph(mt_df, 'Number of Topics', 'Topic Coherence', 'Topic Coherence vs Number of Topics' )
    topics_graph.update_layout(width=650)

    mdt_best = round(mt_df['coherence'].max(),4)

    st.markdown(f"The **:blue[best performing model]** obtained a coherence score of **:blue[{mdt_best}]** ! \n \
    The model performed best with {passes_vert} iterations over the whole corpus and {topics_vert} number of topics.")

    col1, col2 = st.columns(2)

    with col1:
        st.write(passes_graph)

    with col2:
        st.write(topics_graph)

    # Load the extreme-filtering sweep; ``best_model``/``bow_corpus``/
    # ``dictionary`` are also consumed implicitly by load_related below.
    ex_df, best_model, bow_corpus, dictionary = load_ex(site)

    st.subheader("The model were also found to be performing better when extreme word occurrences are filtered!")

    ex_best = round(ex_df['coherence'].max(), 4)
    # NOTE(review): this is a ratio (new/old), not a percentage increase,
    # yet it is rendered with a '%' suffix below — confirm intent.
    imp = round(ex_best / mdt_best, 4)

    st.markdown(f"This time, the **:blue[best performing model]** obtained a coherence score of **:blue[{ex_best}]**. \n \
    An increase of another **:blue[{imp}]**% !")

    best_graph, best_vert = load_evaluation_graph(ex_df, 'Percentage of Documents Used to Filter', 'Topic Coherence', 'Topic Coherence vs Percentage of Documents' )

    best_graph.update_layout(width=1400)

    st.write(best_graph)

    #col1, col2 = st.columns(2)

    processed_series = load_model(site)

    # Back to the display name for the user-facing text below.
    if site == 'PopSci':
        site = 'Popular Science'
    elif site == 'Discover':
        site = 'Discover Magazine'
    elif site == 'Cosmos':
        site = 'Cosmos Magazine'

    document_count, fifth, ninefifth = load_document_count(processed_series)
    topic_document_count, top_3, top_i = load_topic_document_count(best_model, bow_corpus)

    # top_3 arrives as a comma-joined string of the topic's top-3 words.
    top_3 = top_3.split(',')

    st.subheader("How long are the documents?")

    st.markdown(f"Most documents in {site} are between **:blue[{fifth}]** and **:blue[{ninefifth}]** words long!")

    st.write(document_count)

    st.subheader(f"What are the most discussed topics in {site}?")

    st.markdown(f"The most discussed topics are related to the keywords **:blue[{top_3[0].upper()}]**, **:blue[{top_3[1].upper()}]** and **:blue[{top_3[2].upper()}]**")
    st.write(topic_document_count)

    # Short prefix again for the data-directory lookup in load_related.
    if site == 'Popular Science':
        site = 'PopSci'
    elif site == 'Discover Magazine':
        site = 'Discover'
    elif site == 'Cosmos Magazine':
        site = 'Cosmos'

    related_url = load_related(site, bow_corpus, top_i)

    st.subheader("These articles have the highest probability of having above topic!")

    st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True)

    st.write(related_url, width=1000)

    st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True)

    st.subheader("Explore the topics below!")

    st.markdown(vert_space, unsafe_allow_html=True)

    # Display name once more: load_cloud_each keys its stopwords on it.
    if site == 'PopSci':
        site = 'Popular Science'
    elif site == 'Discover':
        site = 'Discover Magazine'
    elif site == 'Cosmos':
        site = 'Cosmos Magazine'

    load_cloud_each(best_model, site)

    st.markdown('<div style="padding: 40px 5px;"></div>', unsafe_allow_html=True)

    # Embed the interactive pyLDAvis panel as raw HTML.
    lda_vis = load_LDAvis(best_model, bow_corpus, dictionary)
    #st.write(lda_vis)

    st.subheader("LDAVis Visualization")
    st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
    st.components.v1.html(lda_vis, height=1100, width=1400)
|