import streamlit as st
import pandas as pd
import numpy as np
import re
import math
import pickle
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from bokeh.plotting import figure
from plotly.subplots import make_subplots
from sklearn.manifold import TSNE
from gensim.parsing.preprocessing import STOPWORDS
from wordcloud import WordCloud
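# Shared palette; color_func below indexes it by topic number, so it must
# hold at least as many colors as the model has topics.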
colors = ['peachpuff', 'lightskyblue', 'turquoise', 'darkorange', 'purple', 'olive', 'lightgreen', 'darkseagreen', 'maroon', 'teal',
          'deepskyblue', 'red', 'mediumblue', 'indigo', 'goldenrod', 'mediumvioletred', 'pink', 'beige', 'rosybrown']
st.set_page_config(layout="wide")
st.markdown("<h1 style='font-weight: normal'><b>Topic Model</b>: Science and Technology News</h1>", unsafe_allow_html=True)
def load_mpmt(site):
    with open(f'./Models/{site}Models/{site.lower()}_lda_passes_train.pickle', 'rb') as file:
        model_passes = pickle.load(file)
    with open(f'./Models/{site}Models/{site.lower()}_lda_topics_train.pickle', 'rb') as file:
        model_topics = pickle.load(file)
    mp_df = pd.DataFrame(model_passes).transpose().iloc[0:50]
    mp_df['coherence'] = mp_df['coherence'].astype(float)
    mt_df = pd.DataFrame(model_topics).transpose().iloc[0:50]
    mt_df['coherence'] = mt_df['coherence'].astype(float)
    return mp_df, mt_df
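# Load the extreme-word-filtering experiment and return its results together
# with the best-scoring model, its bag-of-words corpus and its dictionary.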
def load_ex(site):
    with open(f'./Models/{site}Models/{site.lower()}_extreme2.pickle', 'rb') as file:
        model_extreme = pickle.load(file)
    ex_df = pd.DataFrame(model_extreme)
    ex_df = ex_df.transpose()
    ex_df['coherence'] = ex_df['coherence'].astype(float)
    ex_df = ex_df.reset_index()
    best_idx = ex_df['coherence'].idxmax()
    best_model = ex_df.iloc[best_idx]['model']
    bow_corpus = ex_df.iloc[best_idx]['corpus']
    dictionary = ex_df.iloc[best_idx]['dictionary']
    return ex_df, best_model, bow_corpus, dictionary
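# Load the preprocessed (tokenized) documents for a site. Despite the name,
# this returns the document series used for the word clouds and statistics,
# not a model.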
def load_model(site):
    with open(f'./{site}Data/preprocessed_scitech.pkl', 'rb') as file:
        processed_series = pickle.load(file)
    return processed_series
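# Tag each article with its dominant topic and return the URLs of up to ten
# articles whose dominant topic is `highest_top`.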
def load_related(site, bow_corpus, best_model, highest_top):
    with open(f"./{site}Data/SciTechData.pkl", "rb") as file:
        news = pickle.load(file)
    dm_topic = []
    for corp in bow_corpus:
        topic_percs = best_model.get_document_topics(corp)
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)
    news['dominant_topic'] = dm_topic
    return news[news['dominant_topic'] == highest_top]['url'][:10]
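# Plot coherence against the evaluated hyperparameter. Runs with more than
# 25 rows are the passes/topics sweeps; shorter runs are the extreme-word
# filter sweep, plotted against the 30-90% filtering range.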
def load_evaluation_graph(data, xlabel, ylabel, title):
    if len(data) > 25:
        fig = px.line(data, x=range(1, len(data) + 1), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        fig.add_hline(y=data['coherence'].max())
        # index labels differ between the passes and topics runs, so try both separators
        try:
            vert_value = int(data['coherence'].idxmax().split('a')[1])
        except (ValueError, IndexError):
            vert_value = int(data['coherence'].idxmax().split('s')[1])
    else:
        fig = px.line(data[::-1], x=range(30, 100, 10), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        # map the best row's position in plotting order onto the 30-90 x-axis
        vert_value = 30 + 10 * int(data[::-1].reset_index()['coherence'].idxmax())
        fig.update_xaxes(range=[30, 90])
    fig.add_vline(x=vert_value)
    return fig, vert_value
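# Build a single word cloud over every document and wrap it in a Plotly figure.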
def load_cloud(processed_series):
    all_words = ''
    stopwords = set(STOPWORDS)
    for val in processed_series:
        all_words += ' '.join(val) + ' '
    wordcloud = WordCloud(width=1800, height=1600,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(all_words)
    fig = px.imshow(wordcloud)
    return fig
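# Render one small word cloud per topic, five per row, coloring each topic's
# three strongest words with that topic's palette color. The extra stopwords
# are site-specific tokenization leftovers.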
def load_cloud_each(model, site):
    if site == 'Popular Science' or site == 'Cosmos Magazine':
        words = ['u']
    elif site == 'Discover Magazine':
        words = ['nt', 'u', 've', 'm', 'll', 'd', 'rofl']
    else:
        words = []
    stopwords = set(STOPWORDS)
    stopwords.update(words)
    num_topics = len(model.get_topics())
    topic_top3words = [(i, word) for i, topic in model.show_topics(formatted=False, num_topics=num_topics)
                       for j, (word, wt) in enumerate(topic) if j < 3]
    # group the flat (topic_id, word) pairs into one three-word list per topic
    top3_per_topic = [[word for _, word in topic_top3words[i:i + 3]]
                      for i in range(0, len(topic_top3words), 3)]
    cloud = WordCloud(stopwords=stopwords,
                      background_color='white',
                      width=750,
                      height=750,
                      max_words=10,
                      colormap='tab10',
                      # n is read at render time, so each generate call below colors its own topic
                      color_func=lambda *args, **kwargs: color_func(*args, **kwargs, n=n, topics=top3_per_topic[n]),
                      prefer_horizontal=1.0)
    topics = model.show_topics(num_topics=num_topics, formatted=False)
    j = 0
    n = 0
    cols = st.columns(5)
    while n < num_topics:
        if j == 5:
            j = 0
            cols = st.columns(5)
        with cols[j]:
            fig = plt.figure(figsize=(1.5, 1.5))
            plt.title('Topic ' + str(n + 1), fontdict=dict(size=6))
            plt.axis('off')
            topic_words = dict(topics[n][1])
            cloud.generate_from_frequencies(topic_words, max_font_size=400)
            plt.imshow(cloud)
            st.write(fig)
        j += 1
        n += 1
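# Prepare the interactive pyLDAvis view and return it as embeddable HTML.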
def load_LDAvis(model, corpus, dictionary):
    vis = gensimvis.prepare(model, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(vis)
    return html_string
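# Count documents per dominant topic, label the x-axis with each topic's top
# three words, and return the figure plus the most common topic's words and id.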
def load_topic_document_count(best_model, bow_corpus):
    dm_topic = []
    for corp in bow_corpus:
        topic_percs = best_model.get_document_topics(corp)
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)
    dm_df = pd.DataFrame(dm_topic, columns=['dominant_topic'])
    topic_top3words = [(i, word) for i, topic in best_model.show_topics(formatted=False, num_topics=-1)
                       for j, (word, wt) in enumerate(topic) if j < 3]
    df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
    df_top3words = df_top3words_stacked.groupby('topic_id').agg(', '.join)
    df_top3words.reset_index(level=0, inplace=True)
    count_df = pd.DataFrame(dm_df.groupby('dominant_topic').dominant_topic.agg('count').to_frame('COUNT').reset_index()['COUNT'])
    count_df['top3'] = list(df_top3words['words'])
    fig = px.histogram(dm_df,
                       x='dominant_topic',
                       labels={'dominant_topic': 'Dominant topic', 'count': 'Number of Documents'},
                       height=500,
                       width=1400,
                       title='Document Count by Dominant Topic')
    fig.update_layout(yaxis_title='Number of Documents', bargap=0.2)
    fig.update_layout(
        margin=dict(b=40),
        xaxis=dict(
            tickmode='array',
            tickvals=list(range(dm_df['dominant_topic'].max() + 1)),
            ticktext=df_top3words['words']
        )
    )
    return fig, count_df[count_df['COUNT'] == count_df['COUNT'].max()]['top3'].values[0], count_df['COUNT'].idxmax()
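# Histogram of document lengths annotated with summary statistics; also
# returns the 5th and 95th percentiles for the narrative text.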
def load_document_count(data):
    doc_len = [len(d) for d in data]
    fifth = round(np.quantile(doc_len, q=0.05))
    ninefifth = round(np.quantile(doc_len, q=0.95))
    text = "Mean : " + str(round(np.mean(doc_len))) \
        + "<br>Median : " + str(round(np.median(doc_len))) \
        + "<br>Std dev. : " + str(round(np.std(doc_len))) \
        + "<br>5th percentile : " + str(fifth) \
        + "<br>95th percentile : " + str(ninefifth)
    fig = px.histogram(doc_len, labels={"value": "Document Word Count"}, height=500, width=1400, title='Distribution of Document Word Counts')
    fig.add_annotation(x=0.95, xref='paper', y=0.95, yref='paper', text=text, showarrow=False, bgcolor="#F4F4F4", opacity=0.8, borderpad=8, borderwidth=2, bordercolor="#DDDDDD", align='left')
    fig.update_layout(yaxis_title='Number of Documents', showlegend=False)
    return fig, fifth, ninefifth
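# wordcloud color callback: highlight a topic's top-three words in the
# topic's palette color and grey out everything else.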
def color_func(word, font_size, position, orientation, font_path, random_state, n, topics):
    if word in topics:
        return colors[n]
    return 'lightgrey'
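# Parse the word/probability pairs out of gensim's formatted topic strings
# and draw one bar chart per topic in a five-column subplot grid.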
def load_topic_word_prob(best_model):
    # show_topics returns strings like '0.030*"word" + 0.025*"other" + ...'
    topic_strings = [t[1] for t in best_model.show_topics(num_topics=-1)]
    prob_list = []
    words_list = []
    for s in topic_strings:
        prob_list.append([float(p) for p in re.findall(r'\d*\.\d+', s)])
        words_list.append(re.findall(r'"(.*?)"', s))
    def flatten(l):
        return [item for sublist in l for item in sublist]
    words_list = flatten(words_list)
    # show_topics defaults to ten words per topic
    topnum_list = sorted(list(range(best_model.num_topics)) * 10)
    prob_list = flatten(prob_list)
    data = {
        "topic": topnum_list,
        "words": words_list,
        "probability": prob_list
    }
    topic_prob = pd.DataFrame(data)
    new_df = topic_prob.set_index(['topic'])
    rows = math.ceil(best_model.num_topics / 5)
    fig = make_subplots(
        rows=rows,
        cols=5,
        shared_yaxes=True,
        subplot_titles=[f'Topic {n}' for n in range(1, best_model.num_topics + 1)]
    )
    n = 0
    for i in range(1, rows + 1):
        for j in range(1, 6):
            if n < best_model.num_topics:
                fig.add_trace(
                    go.Bar(x=new_df.loc[n]['words'], y=new_df.loc[n]['probability']),
                    row=i, col=j
                )
                n += 1
    fig.update_layout(height=1000, width=1400, title_text="Topic Word Probabilities", showlegend=False, margin=dict(b=5))
    return fig
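# Project the document-topic weights to 2D with t-SNE and scatter-plot them,
# colored by dominant topic. Defined for exploration; not called by the page.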
def load_tSNE(best_model, bow_corpus):
    # Build a dense document-topic matrix; get_document_topics can skip
    # low-probability topics, so index each weight by its topic id
    topic_weights = []
    for row_list in best_model[bow_corpus]:
        weights = [0.0] * best_model.num_topics
        for topic_id, w in row_list:
            weights[topic_id] = w
        topic_weights.append(weights)
    arr = pd.DataFrame(topic_weights).values
    # Keep only the well-separated points (optional)
    arr = arr[np.amax(arr, axis=1) > 0.35]
    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)
    # t-SNE dimension reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    # Plot the topic clusters with Bokeh, reusing the module-level palette
    n_topics = best_model.num_topics
    mycolors = np.array(colors)
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
                  plot_width=900, plot_height=700)
    plot.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
    return plot
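# Page flow: pick a site, then render the evaluation, statistics and topic
# visualizations for it in order.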
site = st.selectbox(
    'Select which site to analyze topics',
    ('Popular Science', 'Discover Magazine', 'Cosmos Magazine'),
)
# Short keys used in the model/data folder names
SITE_KEYS = {'Popular Science': 'PopSci', 'Discover Magazine': 'Discover', 'Cosmos Magazine': 'Cosmos'}
vert_space = '<div style="padding: 20px 5px;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
if site:
    site_key = SITE_KEYS[site]
    mp_df, mt_df = load_mpmt(site_key)
| st.subheader("How good is the model?") | |
| passes_graph, passes_vert = load_evaluation_graph(mp_df, 'Number of Passes', 'Topic Coherence', 'Topic Coherence vs Number of Passes' ) | |
| passes_graph.update_layout(width=650) | |
| topics_graph, topics_vert = load_evaluation_graph(mt_df, 'Number of Topics', 'Topic Coherence', 'Topic Coherence vs Number of Topics' ) | |
| topics_graph.update_layout(width=650) | |
| mdt_best = round(mt_df['coherence'].max(),4) | |
| st.markdown(f"The **:blue[best performing model]** obtained a coherence score of **:blue[{mdt_best}]** ! \n \ | |
| The model performed best with {passes_vert} iterations over the whole corpus and {topics_vert} number of topics.") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.write(passes_graph) | |
| with col2: | |
| st.write(topics_graph) | |
    ex_df, best_model, bow_corpus, dictionary = load_ex(site_key)
    st.subheader("The model was also found to perform better when extreme word occurrences are filtered out!")
    ex_best = round(ex_df['coherence'].max(), 4)
    # relative improvement over the previous best score, as a percentage
    imp = round((ex_best / mdt_best - 1) * 100, 2)
    st.markdown(f"This time, the **:blue[best performing model]** obtained a coherence score of **:blue[{ex_best}]**.  \n"
                f"An increase of another **:blue[{imp}]**%!")
    best_graph, best_vert = load_evaluation_graph(ex_df, 'Percentage of Documents Used to Filter', 'Topic Coherence', 'Topic Coherence vs Percentage of Documents')
    best_graph.update_layout(width=1400)
    st.write(best_graph)
    processed_series = load_model(site_key)
    document_count, fifth, ninefifth = load_document_count(processed_series)
    topic_document_count, top_3, top_i = load_topic_document_count(best_model, bow_corpus)
    top_3 = [w.strip() for w in top_3.split(',')]
    st.subheader("How long are the documents?")
    st.markdown(f"Most documents in {site} are between **:blue[{fifth}]** and **:blue[{ninefifth}]** words long!")
    st.write(document_count)
    st.subheader(f"What are the most discussed topics in {site}?")
    st.markdown(f"The most discussed topics are related to the keywords **:blue[{top_3[0].upper()}]**, **:blue[{top_3[1].upper()}]** and **:blue[{top_3[2].upper()}]**")
    st.write(topic_document_count)
    related_url = load_related(site_key, bow_corpus, best_model, top_i)
    st.subheader("These articles have the highest probability of belonging to the topic above!")
    st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True)
    st.dataframe(related_url, width=1000)
| st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True) | |
| st.subheader("Explore the topics below!") | |
| st.markdown(vert_space, unsafe_allow_html=True) | |
| if site == 'PopSci': | |
| site = 'Popular Science' | |
| elif site == 'Discover': | |
| site = 'Discover Magazine' | |
| elif site == 'Cosmos': | |
| site = 'Cosmos Magazine' | |
| load_cloud_each(best_model, site) | |
| st.markdown('<div style="padding: 40px 5px;"></div>', unsafe_allow_html=True) | |
| lda_vis = load_LDAvis(best_model, bow_corpus, dictionary) | |
| #st.write(lda_vis) | |
| st.subheader("LDAVis Visualization") | |
| st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True) | |
| st.components.v1.html(lda_vis, height=1100, width=1400) |