Spaces:
Runtime error
Runtime error
First Commit
Browse files- .gitignore +1 -0
- CosmosData/Data.pkl +3 -0
- CosmosData/SciData.pkl +3 -0
- CosmosData/SciTechData.pkl +3 -0
- CosmosData/TechData.pkl +3 -0
- CosmosData/bow_corpus.pkl +3 -0
- CosmosData/dictionary.pkl +3 -0
- CosmosData/preprocessed_scitech.pkl +3 -0
- CosmosData/test_data.pkl +3 -0
- CosmosData/train_data.pkl +3 -0
- app.py +477 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Viz
|
CosmosData/Data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10d412f2c213194de8fad35f72b2b89a2296791586efc2896cc37ae24d8cfee0
|
3 |
+
size 6097905
|
CosmosData/SciData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8000870576a832b5687cbc9f79c6bf58d0fc812b86732627753369eacd7c39cd
|
3 |
+
size 12995836
|
CosmosData/SciTechData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11ac166393d5c6c9a6fc0d5300402df108c73853ebaa3c00654703bc0d485fad
|
3 |
+
size 18207515
|
CosmosData/TechData.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a71736ae1115b557e58472c0e2cf0726ad0b02ddd22daeb30c49c4dca9b7220
|
3 |
+
size 6101253
|
CosmosData/bow_corpus.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:236080fb44c7b688b1939b8463029b082ea37c27c872fe39a3c378395eafea49
|
3 |
+
size 5583422
|
CosmosData/dictionary.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:365560d76985137c95f2007dc40e7f6fef511693ef26b7b1ad1393a9b539143c
|
3 |
+
size 290617
|
CosmosData/preprocessed_scitech.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2dd754280f5543895399e85b3c78aa295a73c8dda23dadbc6db85fb155efa09
|
3 |
+
size 13832946
|
CosmosData/test_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bdbc5902b11327dbb9edd8c5c254d5c7aa35e0a6e8eca4f2f67cf5b659169039
|
3 |
+
size 2874718
|
CosmosData/train_data.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53b3018b3d5e26d7e3ca900adf8ade7a05dfe6f143e7b1bc024a642311ad49ba
|
3 |
+
size 11024839
|
app.py
ADDED
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
import math
|
6 |
+
import gensim
|
7 |
+
import pickle
|
8 |
+
import pyLDAvis
|
9 |
+
import pyLDAvis.gensim_models as gensimvis
|
10 |
+
import plotly.express as px
|
11 |
+
import plotly.graph_objects as go
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import matplotlib.colors as mcolors
|
14 |
+
from bokeh.plotting import figure, output_file, show
|
15 |
+
from bokeh.models import Label
|
16 |
+
from bokeh.io import output_notebook
|
17 |
+
from plotly.subplots import make_subplots
|
18 |
+
from pandasgui import show
|
19 |
+
from sklearn.manifold import TSNE
|
20 |
+
from sklearn.model_selection import train_test_split
|
21 |
+
from gensim.parsing.preprocessing import STOPWORDS
|
22 |
+
from wordcloud import WordCloud
|
23 |
+
|
24 |
+
# Palette used to colour each topic's highlight words in the per-topic
# word clouds (indexed by topic number via color_func below).
colors = ['peachpuff','lightskyblue','turquoise','darkorange','purple','olive','lightgreen','darkseagreen','maroon','teal',
          'deepskyblue','red','mediumblue','indigo','goldenrod','mediumvioletred','pink','beige','rosybrown']

# Must be the first Streamlit command executed on the page.
st.set_page_config(layout="wide")

st.markdown("<h1 style='font-weight: normal'><b>Topic Model</b>: Science and Technology News</h1>", unsafe_allow_html=True)
|
30 |
+
|
31 |
+
def load_mpmt(site):
    """Load the passes- and topics-coherence sweeps for *site*.

    Reads the two pickled sweep results produced during training and returns
    each as a DataFrame: one row per candidate model (first 50 kept), with
    the 'coherence' column coerced to float.

    Args:
        site: short site prefix used in the model directory names
            (e.g. 'Cosmos').

    Returns:
        (mp_df, mt_df): the passes sweep and the topics sweep DataFrames.
    """
    def _load_sweep(kind):
        # Both pickles share the same shape (model-name -> metrics dict);
        # factor the duplicated load/transpose/truncate/coerce sequence.
        with open(f'./Models/{site}Models/{site.lower()}_lda_{kind}_train.pickle', 'rb') as file:
            sweep = pickle.load(file)

        df = pd.DataFrame(sweep)
        df = df.transpose()
        df = df.iloc[0:50]
        df['coherence'] = df['coherence'].astype(float)
        return df

    return _load_sweep('passes'), _load_sweep('topics')
|
49 |
+
|
50 |
+
def load_ex(site):
    """Load the extreme-word-filtering sweep and select its best model.

    Args:
        site: short site prefix used in the model directory names.

    Returns:
        (ex_df, best_model, bow_corpus, dictionary): the full sweep
        DataFrame plus the model, corpus and dictionary from the row with
        the highest coherence.
    """
    with open(f'./Models/{site}Models/{site.lower()}_extreme2.pickle', 'rb') as file:
        model_extreme = pickle.load(file)

    ex_df = pd.DataFrame(model_extreme)
    ex_df = ex_df.transpose()
    ex_df['coherence'] = ex_df['coherence'].astype(float)
    ex_df = ex_df.reset_index()

    # Locate the best-coherence row once instead of recomputing idxmax
    # (and the positional lookup) three separate times.
    best_row = ex_df.iloc[ex_df['coherence'].idxmax()]
    best_model = best_row['model']
    bow_corpus = best_row['corpus']
    dictionary = best_row['dictionary']

    return ex_df, best_model, bow_corpus, dictionary
|
64 |
+
|
65 |
+
def load_model(site):
    """Return the preprocessed (tokenised) document series for *site*.

    Note: despite the name, this loads preprocessed text, not an LDA model.
    """
    path = f'./{site}Data/preprocessed_scitech.pkl'
    with open(path, 'rb') as fh:
        series = pickle.load(fh)
    return series
|
70 |
+
|
71 |
+
def load_related(site, bow_corpus, highest_top, model=None):
    """Return up to 10 article URLs whose dominant topic is *highest_top*.

    Args:
        site: short data-directory prefix (e.g. 'Cosmos').
        bow_corpus: bag-of-words corpus aligned row-for-row with the
            pickled news DataFrame.
        highest_top: topic id to filter on.
        model: LDA model used to score documents. Defaults to the
            module-level ``best_model`` for backward compatibility — the
            original implementation read that global implicitly.

    Returns:
        The first 10 'url' values of documents dominated by *highest_top*.
    """
    if model is None:
        # NOTE(review): ``best_model`` is only bound at script level after
        # load_ex() has run — confirm call order before reusing elsewhere.
        model = best_model

    with open(f"./{site}Data/SciTechData.pkl", "rb") as file:
        news = pickle.load(file)

    dm_topic = []
    for corp in bow_corpus:
        topic_percs = model.get_document_topics(corp)
        # Dominant topic = the highest-probability topic for this document.
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)

    news['dominant_topic'] = dm_topic

    return news[news['dominant_topic'] == highest_top]['url'][:10]
|
85 |
+
|
86 |
+
def load_evaluation_graph(data, xlabel, ylabel, title):
    """Build a coherence line chart and mark its best point.

    For long sweeps (>25 rows, i.e. the passes/topics sweeps) the x axis is
    the 1-based sweep position; otherwise (the extreme-filtering sweep) the
    x axis is the filter percentage 30..90.

    Args:
        data: sweep DataFrame with a float 'coherence' column.
        xlabel, ylabel, title: chart labels.

    Returns:
        (fig, vert_value): the plotly figure and the x position of the
        maximum coherence, also drawn as a vertical line.
    """
    if (len(data) > 25):
        fig = px.line(data, x=range(1, len(data)+1), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        fig.add_hline(y=data['coherence'].max())
        # Index labels embed the sweep number after an 'a' (or 's' in the
        # other naming scheme); pull the trailing number out. Narrowed from
        # a bare ``except:`` so unrelated errors are no longer swallowed.
        try:
            vert_value = int(data['coherence'].idxmax().split('a')[1])
        except (IndexError, ValueError):
            vert_value = int(data['coherence'].idxmax().split('s')[1])
    else:
        fig = px.line(data[::-1], x=range(30, 100, 10), y='coherence', title=title, labels={'x': xlabel, 'y': ylabel})
        vert_value = int(data.reset_index()['coherence'].idxmax())
        fig.update_xaxes(range=[30, 90])

    fig.add_vline(x=vert_value)

    return fig, vert_value
|
102 |
+
|
103 |
+
def load_cloud(processed_series):
    """Render one word cloud over every document in *processed_series*.

    Args:
        processed_series: iterable of token lists (one list per document).

    Returns:
        A plotly figure displaying the word-cloud image.
    """
    stopwords = set(STOPWORDS)

    # Join all documents into one space-separated string. A single join
    # replaces the original quadratic ``+=`` concatenation loop; the
    # trailing space matches the original output exactly.
    all_words = ' '.join(' '.join(val) for val in processed_series) + ' '

    wordcloud = WordCloud(width=1800, height=1600,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(all_words)

    fig = px.imshow(wordcloud)

    return fig
|
124 |
+
|
125 |
+
def load_cloud_each(model, site):
    """Render one small word cloud per topic, five per row of Streamlit columns.

    Each cloud highlights the topic's top-3 words in the topic's palette
    colour (via the module-level ``color_func``); every other word is grey.

    Args:
        model: trained gensim LDA model.
        site: display name of the site; selects the site-specific junk
            tokens added to the stopword list.
    """
    # Site-specific artefact tokens to suppress in the clouds.
    # NOTE(review): ``words`` is unbound for any other site value — confirm
    # the selectbox guarantees one of these three names.
    if site == 'Popular Science' or site == 'Cosmos Magazine':
        words = ['u']
    elif site == 'Discover Magazine':
        words = ['nt', 'u', 've', 'm', 'll', 'd', 'rofl']

    stopwords = set(STOPWORDS)

    for i in words:
        stopwords.add(i)

    num_topics = len(model.get_topics())

    # Flat list of (topic_id, word) pairs: the top-3 words of every topic.
    topic_top3words = [(i, topic) for i, topics in model.show_topics(formatted=False, num_topics=num_topics) for j, (topic, wt) in enumerate(topics) if j < 3]

    # Regroup the flat pair list into one 3-word list per topic;
    # new_new_list[n] ends up holding topic n's highlight words.
    k=0
    new_list = []
    new_new_list = []

    j = 0
    while (j < len(topic_top3words)):
        i = topic_top3words[j][1]

        # At the last element the current group is appended first; the final
        # word still lands in it because the appended list is the SAME object
        # mutated by new_list.append(i) below (deliberate aliasing).
        if(j == len(topic_top3words)-1):
            new_new_list.append(new_list)

        if(k<3):
            j += 1
        else:
            # Group of 3 complete: store it and start a fresh group without
            # consuming the current word (hence the continue).
            new_new_list.append(new_list)
            new_list = []
            k = 0
            continue
        new_list.append(i)
        k += 1

    # The lambda captures ``n`` late (by reference): each
    # generate_from_frequencies() call below therefore colours with the
    # topic index current at draw time.
    cloud = WordCloud(stopwords=stopwords,
                      background_color='white',
                      width=750,
                      height=750,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: color_func(*args, **kwargs, n=n, topics=new_new_list[n]),
                      prefer_horizontal=1.0)

    topics = model.show_topics(num_topics=num_topics, formatted=False)

    j = 0  # column slot within the current row (0..4)
    n = 0  # topic index
    col1, col2, col3, col4, col5 = st.columns(5)

    while n < num_topics:
        if (j < 5):
            if (j == 0):
                col = col1
            elif (j == 1):
                col = col2
            elif (j == 2):
                col = col3
            elif (j == 3):
                col = col4
            elif (j == 4):
                col = col5
        else:
            # Row full: start a fresh row of five columns and retry this topic.
            j = 0
            col1, col2, col3, col4, col5 = st.columns(5)
            continue

        with col:
            fig = plt.figure(figsize=(1.5,1.5))
            plt.title('Topic ' + str(n+1), fontdict=dict(size=6))
            plt.axis('off')
            topic_words = dict(topics[n][1])
            cloud.generate_from_frequencies(topic_words, max_font_size=400)
            plt.imshow(cloud)
            st.write(fig)

        j += 1
        n += 1
|
204 |
+
|
205 |
+
def load_LDAvis(model, corpus, dictionary):
    """Build the pyLDAvis topic visualisation and return it as embeddable HTML."""
    prepared = gensimvis.prepare(model, corpus, dictionary)
    return pyLDAvis.prepared_data_to_html(prepared)
|
210 |
+
|
211 |
+
def load_topic_document_count(best_model, bow_corpus):
    """Histogram of document counts per dominant topic.

    Args:
        best_model: trained gensim LDA model.
        bow_corpus: bag-of-words corpus to score.

    Returns:
        (fig, top3_words, top_topic_id): the plotly histogram, the
        comma-joined top-3 words of the most populous dominant topic, and
        that topic's id.
    """
    dm_topic = []

    # Dominant topic = highest-probability topic for each document.
    for i, corp in enumerate(bow_corpus):
        topic_percs = best_model.get_document_topics(corp)
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dm_topic.append(dominant_topic)

    dm_df = pd.DataFrame(dm_topic, columns=['dominant_topic'])

    # Flat (topic_id, word) pairs for the top 3 words of every topic.
    topic_top3words = [(i, topic) for i, topics in best_model.show_topics(formatted=False, num_topics=-1) for j, (topic, wt) in enumerate(topics) if j < 3]

    # Collapse to one comma-joined string of 3 words per topic id.
    df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
    df_top3words = df_top3words_stacked.groupby('topic_id').agg(', '.join)
    df_top3words.reset_index(level=0,inplace=True)

    # Document count per dominant topic, annotated with the topic's words.
    # NOTE(review): assumes every topic is dominant for at least one
    # document, so the groupby rows align 1:1 with df_top3words — confirm.
    count_df = pd.DataFrame(dm_df.groupby('dominant_topic').dominant_topic.agg('count').to_frame('COUNT').reset_index()['COUNT'])
    count_df['top3'] = list(df_top3words['words'])

    fig = px.histogram(dm_df,
                       x='dominant_topic',
                       labels={'dominant_topic': 'Dominant topic', 'count': 'Number of Documents'},
                       height=500,
                       width=1400,
                       title='Documents Count by Dominant Topic')
    fig.update_layout(yaxis_title='Number of Documents', bargap=0.2)
    fig.update_layout(
        margin=dict(b=40),
        xaxis = dict(
            tickmode = 'array',
            tickvals = list(range(dm_df['dominant_topic'].max()+1)),
            # Label each tick with the topic's top-3 words instead of its id.
            ticktext = df_top3words['words']
        )
    )

    return fig, count_df[count_df['COUNT'] == count_df['COUNT'].max()]['top3'].values[0], count_df['COUNT'].idxmax()
|
247 |
+
|
248 |
+
def load_document_count(data):
    """Histogram of per-document word counts with summary statistics.

    Args:
        data: iterable of token lists (one list per document).

    Returns:
        (fig, fifth, ninefifth): the plotly figure plus the rounded 5th and
        95th percentile document lengths.
    """
    doc_len = [len(d) for d in data]

    fifth = round(np.quantile(doc_len, q=0.05))
    ninefifth = round(np.quantile(doc_len, q=0.95))

    # Annotation text; reuses the percentiles computed above instead of
    # recomputing np.quantile twice as the original did.
    text = "Mean : " + str(round(np.mean(doc_len))) \
        + "<br>Median : " + str(round(np.median(doc_len))) \
        + "<br>Std dev. : " + str(round(np.std(doc_len))) \
        + "<br>5th percentile : " + str(fifth) \
        + "<br>95th percentile : " + str(ninefifth)

    fig = px.histogram(doc_len, labels={"value": "Document Word Count"}, height=500, width=1400, title='Distribution of Documents Word Count')
    fig.add_annotation(x=0.95, xref='paper', y=0.95, yref='paper', text=text, showarrow=False, bgcolor="#F4F4F4", opacity=0.8, borderpad=8, borderwidth=2, bordercolor="#DDDDDD", align='left')
    fig.update_layout(yaxis_title='Number of Documents', showlegend=False)

    return fig, fifth, ninefifth
|
265 |
+
|
266 |
+
def color_func(word, font_size, position, orientation, font_path, random_state, n, topics):
    """WordCloud colour callback: topic palette colour when *word* is one of
    the topic's highlight *topics* words, light grey otherwise."""
    return colors[n] if word in topics else 'lightgrey'
|
271 |
+
|
272 |
+
def load_topic_word_prob(best_model):
    """Bar-chart grid of the top word probabilities of every topic.

    Parses the model's ``show_topics`` string output (fragments like
    '0.025*"word"') back into numbers and words, then plots one bar subplot
    per topic, five per row.

    Args:
        best_model: trained gensim LDA model.

    Returns:
        A plotly figure with one bar trace per topic.
    """
    # One entry per topic: the formatted probability string split on commas.
    topic_prob_list = [i[1].split(',') for i in best_model.show_topics(num_topics=-1)]

    prob_list = []
    words_list = []

    for i in topic_prob_list:
        # NOTE(review): ``*i`` unpacks the split list into re.findall's
        # arguments — this only works because the topic string contains no
        # commas, so the list has exactly one element; confirm.
        num_list = re.findall(r'[\d]*[.][\d]+', *i)
        conv = [float(j) for j in num_list]
        prob_list.append(conv)

        # The words are the double-quoted substrings.
        words = re.findall(r'"(.*?)"', *i)
        words_list.append(words)

    def flatten(l):
        # One-level flatten: list of lists -> flat list.
        return [item for sublist in l for item in sublist]

    words_list = flatten(words_list)
    # show_topics emits 10 words per topic, so repeat each topic id 10x.
    topnum_list = sorted(list(range(best_model.num_topics)) * 10)
    prob_list = flatten(prob_list)

    data = {
        "topic": topnum_list,
        "words": words_list,
        "probability": prob_list
    }

    topic_prob = pd.DataFrame(data)
    new_df = topic_prob.set_index(['topic'])

    # Grid dimensions: five subplots per row.
    rows = math.ceil(best_model.num_topics / 5)

    fig = make_subplots(
        rows=rows,
        cols=5,
        shared_yaxes=True,
        subplot_titles=[f'Topic {n}' for n in range(1, best_model.num_topics+1)]
    )

    j = 1
    n = 0

    # Fill the grid row-major; trailing cells past the last topic stay empty.
    for i in range(1, rows+1):
        for j in range(1, 6):
            if (n < best_model.num_topics):
                fig.add_trace(
                    go.Bar(x=new_df.loc[n]['words'], y=new_df.loc[n]['probability']),
                    row=i, col=j
                )

            n += 1

    fig.update_layout(height=1000, width=1400, title_text="Topic Word Probabilities", showlegend=False, margin=dict(b=5))

    return fig
|
327 |
+
|
328 |
+
def load_tSNE(best_model, bow_corpus):
    """t-SNE scatter plot of document-topic weight vectors.

    Documents whose strongest topic weight exceeds 0.35 are projected to
    2-D with t-SNE and coloured by dominant topic.

    Args:
        best_model: trained gensim LDA model.
        bow_corpus: bag-of-words corpus to score.

    Returns:
        A bokeh figure containing the scatter plot.
    """
    # Topic weight vector for every document.
    topic_weights = []
    for i, row_list in enumerate(best_model[bow_corpus]):
        topic_weights.append([w for i, w in row_list])

    # Dense array of topic weights (absent topics filled with 0).
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # Keep only well-separated points (dominant weight > 0.35).
    arr = arr[np.amax(arr, axis=1) > 0.35]

    # Dominant topic number for each remaining document.
    topic_num = np.argmax(arr, axis=1)

    # t-SNE dimension reduction to 2 components.
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    # Plot the topic clusters using bokeh. The palette is local (named
    # differently to avoid shadowing the module-level ``colors``), and the
    # title now reports the model's real topic count instead of the
    # previously hard-coded 4.
    palette = ['peachpuff','lightskyblue','turquoise','darkorange','purple','olive','lightgreen','darkseagreen','maroon','teal',
               'deepskyblue','red','mediumblue','indigo','goldenrod','mediumvioletred','pink','beige','rosybrown']
    n_topics = best_model.num_topics
    mycolors = np.array(palette)
    plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics),
                  plot_width=900, plot_height=700)
    plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])

    return plot
|
357 |
+
|
358 |
+
# --- Page controls -------------------------------------------------------
# Site picker; its value drives everything rendered below.
site = st.selectbox(
    'Select which site to analyze topics',
    ('Popular Science', 'Discover Magazine', 'Cosmos Magazine'),
)

# Reusable vertical spacer snippet.
vert_space = '<div style="padding: 20px 5px;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
|
365 |
+
|
366 |
+
if site:
    # Map the display name to the short prefix used in file/model paths.
    # (The script repeatedly flips ``site`` between display name and prefix
    # below, depending on whether it is shown to the user or used in paths.)
    if site == 'Popular Science':
        site = 'PopSci'
    elif site == 'Discover Magazine':
        site = 'Discover'
    elif site == 'Cosmos Magazine':
        site = 'Cosmos'

    mp_df, mt_df = load_mpmt(site)

    st.subheader("How good is the model?")

    # Coherence-vs-passes and coherence-vs-topics evaluation charts.
    passes_graph, passes_vert = load_evaluation_graph(mp_df, 'Number of Passes', 'Topic Coherence', 'Topic Coherence vs Number of Passes' )
    passes_graph.update_layout(width=650)

    topics_graph, topics_vert = load_evaluation_graph(mt_df, 'Number of Topics', 'Topic Coherence', 'Topic Coherence vs Number of Topics' )
    topics_graph.update_layout(width=650)

    mdt_best = round(mt_df['coherence'].max(),4)

    st.markdown(f"The **:blue[best performing model]** obtained a coherence score of **:blue[{mdt_best}]** ! \n \
    The model performed best with {passes_vert} iterations over the whole corpus and {topics_vert} number of topics.")

    col1, col2 = st.columns(2)

    with col1:
        st.write(passes_graph)

    with col2:
        st.write(topics_graph)

    # Load the extreme-filtering sweep; ``best_model``/``bow_corpus``/
    # ``dictionary`` are also consumed implicitly by load_related below.
    ex_df, best_model, bow_corpus, dictionary = load_ex(site)

    st.subheader("The model were also found to be performing better when extreme word occurrences are filtered!")

    ex_best = round(ex_df['coherence'].max(), 4)
    # NOTE(review): this is a ratio (new/old), not a percentage increase,
    # yet it is rendered with a '%' suffix below — confirm intent.
    imp = round(ex_best / mdt_best, 4)

    st.markdown(f"This time, the **:blue[best performing model]** obtained a coherence score of **:blue[{ex_best}]**. \n \
    An increase of another **:blue[{imp}]**% !")

    best_graph, best_vert = load_evaluation_graph(ex_df, 'Percentage of Documents Used to Filter', 'Topic Coherence', 'Topic Coherence vs Percentage of Documents' )

    best_graph.update_layout(width=1400)

    st.write(best_graph)

    #col1, col2 = st.columns(2)

    processed_series = load_model(site)

    # Back to the display name for the user-facing text below.
    if site == 'PopSci':
        site = 'Popular Science'
    elif site == 'Discover':
        site = 'Discover Magazine'
    elif site == 'Cosmos':
        site = 'Cosmos Magazine'

    document_count, fifth, ninefifth = load_document_count(processed_series)
    topic_document_count, top_3, top_i = load_topic_document_count(best_model, bow_corpus)

    # top_3 arrives as a comma-joined string of the topic's top-3 words.
    top_3 = top_3.split(',')

    st.subheader("How long are the documents?")

    st.markdown(f"Most documents in {site} are between **:blue[{fifth}]** and **:blue[{ninefifth}]** words long!")

    st.write(document_count)

    st.subheader(f"What are the most discussed topics in {site}?")

    st.markdown(f"The most discussed topics are related to the keywords **:blue[{top_3[0].upper()}]**, **:blue[{top_3[1].upper()}]** and **:blue[{top_3[2].upper()}]**")
    st.write(topic_document_count)

    # Short prefix again for the data-directory lookup in load_related.
    if site == 'Popular Science':
        site = 'PopSci'
    elif site == 'Discover Magazine':
        site = 'Discover'
    elif site == 'Cosmos Magazine':
        site = 'Cosmos'

    related_url = load_related(site, bow_corpus, top_i)

    st.subheader("These articles have the highest probability of having above topic!")

    st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True)

    st.write(related_url, width=1000)

    st.markdown('<div style="padding: 25px 5px;"></div>', unsafe_allow_html=True)

    st.subheader("Explore the topics below!")

    st.markdown(vert_space, unsafe_allow_html=True)

    # Display name once more: load_cloud_each keys its stopwords on it.
    if site == 'PopSci':
        site = 'Popular Science'
    elif site == 'Discover':
        site = 'Discover Magazine'
    elif site == 'Cosmos':
        site = 'Cosmos Magazine'

    load_cloud_each(best_model, site)

    st.markdown('<div style="padding: 40px 5px;"></div>', unsafe_allow_html=True)

    # Embed the interactive pyLDAvis panel as raw HTML.
    lda_vis = load_LDAvis(best_model, bow_corpus, dictionary)
    #st.write(lda_vis)

    st.subheader("LDAVis Visualization")
    st.markdown('<div style="padding: 20px 5px;"></div>', unsafe_allow_html=True)
    st.components.v1.html(lda_vis, height=1100, width=1400)
|