stephenleo committed
Commit d9f2adf
1 Parent(s): 576be81

many optimizations for streamlit

Files changed (3):
  1. .gitignore +1 -0
  2. app.py +42 -17
  3. helpers.py +79 -52
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
app.py CHANGED
@@ -2,12 +2,22 @@ import networkx as nx
 from streamlit.components.v1 import html
 import streamlit as st
 import helpers
+import logging
+
+# Setup Basic Configuration
 st.set_page_config(layout='wide',
                    page_title='STriP: Semantic Similarity of Scientific Papers!',
                    page_icon='💡'
                    )
 
 
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s: %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S')
+
+logger = logging.getLogger('main')
+
+
 def main():
     st.title('STriP (S3P): Semantic Similarity of Scientific Papers!')
 
@@ -18,39 +28,51 @@ def main():
     ##########
     # Load data
     ##########
+    logger.info('========== Step1: Loading data ==========')
     if uploaded_file is not None:
         df = helpers.load_data(uploaded_file)
     else:
         df = helpers.load_data('data.csv')
 
     data = df.copy()
+    selected_cols = st.multiselect('Select columns to analyse', options=data.columns,
+                                   default=[col for col in data.columns if col.lower() in ['title', 'abstract']])
+    data = data[selected_cols]
+    data = data.dropna()
+    data = data.reset_index(drop=True)
     st.write(f'Number of papers: {len(data)}')
     st.write('First 5 rows of loaded data:')
-    st.write(data[['Title', 'Abstract']].head())
+    st.write(data[selected_cols].head())
+
+    if (data is not None) and selected_cols:
+        # For 'allenai-specter'
+        data['Text'] = data[data.columns[0]]
+        for column in data.columns[1:]:
+            data['Text'] = data['Text'] + '[SEP]' + data[column].astype(str)
 
-    if data is not None:
         ##########
         # Topic modeling
         ##########
+        logger.info('========== Step2: Topic modeling ==========')
         st.header('🔥 Topic Modeling')
 
         cols = st.columns(3)
         with cols[0]:
             min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
-                                       max_value=int(len(data)/3), step=1, value=3,
+                                       max_value=round(len(data)*0.25), step=1, value=min(round(len(data)/25), 10),
                                        help='The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics.')
         with cols[1]:
             n_gram_range = st.slider('N-gram range', key='n_gram_range', min_value=1,
-                                     max_value=4, step=1, value=(1, 3),
+                                     max_value=3, step=1, value=(1, 2),
                                      help='N-gram range for the topic model')
         with cols[2]:
             st.text('')
             st.text('')
             st.button('Reset Defaults', on_click=helpers.reset_default_topic_sliders, key='reset_topic_sliders',
-                      kwargs={'min_topic_size': 3, 'n_gram_range': (1, 3)})
+                      kwargs={'min_topic_size': min(round(len(data)/25), 10), 'n_gram_range': (1, 2)})
 
         with st.spinner('Topic Modeling'):
-            data, topic_model, topics = helpers.topic_modeling(
+            topic_data, topic_model, topics = helpers.topic_modeling(
                 data, min_topic_size=min_topic_size, n_gram_range=n_gram_range)
 
         mapping = {
@@ -65,7 +87,7 @@ def main():
         topic_model_vis_option = st.selectbox(
             'Select Topic Modeling Visualization', mapping.keys())
         try:
-            fig = mapping[topic_model_vis_option]()
+            fig = mapping[topic_model_vis_option](top_n_topics=10)
             fig.update_layout(title='')
             st.plotly_chart(fig, use_container_width=True)
         except:
@@ -75,18 +97,18 @@ def main():
         ##########
         # STriP Network
         ##########
+        logger.info('========== Step3: STriP Network ==========')
         st.header('🚀 STriP Network')
 
-        with st.spinner('Embedding generation'):
-            data = helpers.embeddings(data)
-
         with st.spinner('Cosine Similarity Calculation'):
             cosine_sim_matrix = helpers.cosine_sim(data)
 
-        min_value, value = helpers.calc_optimal_threshold(
+        value, min_value = helpers.calc_optimal_threshold(
            cosine_sim_matrix,
            # 25% is a good value for the number of papers
-           max_connections=helpers.calc_max_connections(len(data), 0.25)
+           max_connections=min(
+               helpers.calc_max_connections(len(data), 0.25), 5_000
+           )
        )
 
        cols = st.columns(3)
@@ -107,7 +129,7 @@ def main():
 
         with st.spinner('Network Generation'):
             nx_net, pyvis_net = helpers.network_plot(
-                data, topics, neighbors)
+                topic_data, topics, neighbors)
 
         # Save and read graph as HTML file (on Streamlit Sharing)
         try:
@@ -129,6 +151,7 @@ def main():
         ##########
         # Centrality
         ##########
+        logger.info('========== Step4: Network Centrality ==========')
         st.header('🏅 Most Important Papers')
 
         centrality_mapping = {
@@ -146,10 +169,12 @@ def main():
         # Calculate centrality
         centrality = centrality_mapping[centrality_option](nx_net)
 
-        with st.spinner('Network Centrality Calculation'):
-            fig = helpers.network_centrality(
-                data, centrality, centrality_option)
-            st.plotly_chart(fig, use_container_width=True)
+        cols = st.columns([1, 10, 1])
+        with cols[1]:
+            with st.spinner('Network Centrality Calculation'):
+                fig = helpers.network_centrality(
+                    topic_data, centrality, centrality_option)
+                st.plotly_chart(fig, use_container_width=True)
 
     st.markdown(
         """
helpers.py CHANGED
@@ -8,6 +8,10 @@ from sklearn.feature_extraction.text import CountVectorizer
 import pandas as pd
 import numpy as np
 import networkx as nx
+import textwrap
+import logging
+
+logger = logging.getLogger('main')
 
 
 def reset_default_topic_sliders(min_topic_size, n_gram_range):
@@ -19,61 +23,60 @@ def reset_default_threshold_slider(threshold):
     st.session_state['threshold'] = threshold
 
 
-@st.cache(allow_output_mutation=True)
-def load_sbert_model():
-    return SentenceTransformer('allenai-specter')
-
-
 @st.cache()
 def load_data(uploaded_file):
     data = pd.read_csv(uploaded_file)
 
-    data = data[['Title', 'Abstract']]
-    data = data.dropna()
-    data = data.reset_index(drop=True)
-
     return data
 
 
-@st.cache(allow_output_mutation=True)
-def topic_modeling(data, min_topic_size, n_gram_range):
-    """Topic modeling using BERTopic
-    """
-    topic_model = BERTopic(
-        embedding_model=load_sbert_model(),
+@st.cache()
+def embedding_gen(data):
+    logger.info('Calculating Embeddings')
+    return SentenceTransformer('allenai-specter').encode(data['Text'])
+
+
+@st.cache()
+def load_bertopic_model(min_topic_size, n_gram_range):
+    logger.info('Loading BERTopic model')
+    return BERTopic(
         vectorizer_model=CountVectorizer(
-            stop_words='english', ngram_range=n_gram_range),
-        min_topic_size=min_topic_size
+            stop_words='english', ngram_range=n_gram_range
+        ),
+        min_topic_size=min_topic_size,
+        verbose=True
     )
 
-    # For 'allenai-specter'
-    data['Title + Abstract'] = data['Title'] + '[SEP]' + data['Abstract']
+
+@st.cache()
+def topic_modeling(data, min_topic_size, n_gram_range):
+    """Topic modeling using BERTopic
+    """
+    logger.info('Calculating Topic Model')
+    topic_model = load_bertopic_model(min_topic_size, n_gram_range)
 
     # Train the topic model
-    data["Topic"], data["Probs"] = topic_model.fit_transform(
-        data['Title + Abstract'])
+    topic_data = data.copy()
+    topic_data["Topic"], topic_data["Probs"] = topic_model.fit_transform(
+        data['Text'], embeddings=embedding_gen(data))
 
     # Merge topic results
-    topic_df = topic_model.get_topic_info()[['Topic', 'Name']]
-    data = data.merge(topic_df, on='Topic', how='left')
+    topic_df = topic_model.get_topic_info()
+    topic_df.columns = ['Topic', 'Topic_Count', 'Topic_Name']
+    topic_df = topic_df.sort_values(by='Topic_Count', ascending=False)
+    topic_data = topic_data.merge(topic_df, on='Topic', how='left')
 
     # Topics
-    topics = topic_df.set_index('Topic').to_dict(orient='index')
-
-    return data, topic_model, topics
+    # Optimization: Only take top 10 largest topics
+    topics = topic_df.head(10).set_index('Topic').to_dict(orient='index')
 
-
-@st.cache(allow_output_mutation=True)
-def embeddings(data):
-    data['embedding'] = load_sbert_model().encode(
-        data['Title + Abstract']).tolist()
-
-    return data
+    return topic_data, topic_model, topics
 
 
 @st.cache()
 def cosine_sim(data):
-    cosine_sim_matrix = cosine_similarity(data['embedding'].values.tolist())
+    logger.info('Cosine similarity')
+    cosine_sim_matrix = cosine_similarity(embedding_gen(data))
 
     # Take only upper triangular matrix
     cosine_sim_matrix = np.triu(cosine_sim_matrix, k=1)
@@ -93,10 +96,11 @@ def calc_optimal_threshold(cosine_sim_matrix, max_connections):
     """Calculates the optimal threshold for the cosine similarity matrix.
     Allows a max of max_connections
    """
-    thresh_sweep = np.arange(0.05, 1.05, 0.05)
+    logger.info('Calculating optimal threshold')
+    thresh_sweep = np.arange(0.05, 1.05, 0.05)[::-1]
     for idx, threshold in enumerate(thresh_sweep):
         neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
-        if len(neighbors) < max_connections:
+        if len(neighbors) > max_connections:
             break
 
     return round(thresh_sweep[idx-1], 2).item(), round(thresh_sweep[idx], 2).item()
@@ -104,6 +108,7 @@ def calc_optimal_threshold(cosine_sim_matrix, max_connections):
 
 @st.cache()
 def calc_neighbors(cosine_sim_matrix, threshold):
+    logger.info('Calculating neighbors')
     neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
 
     return neighbors, len(neighbors)
@@ -122,9 +127,10 @@ def pyvis_hash_func(pyvis_net):
 
 
 @st.cache(hash_funcs={nx.Graph: nx_hash_func, Network: pyvis_hash_func})
-def network_plot(data, topics, neighbors):
+def network_plot(topic_data, topics, neighbors):
     """Creates a network plot of connected papers. Colored by Topic Model topics.
     """
+    logger.info('Calculating Network Plot')
     nx_net = nx.Graph()
     pyvis_net = Network(height='750px', width='100%', bgcolor='#222222')
 
@@ -135,14 +141,21 @@ def network_plot(data, topics, neighbors):
             {
                 'group': row.Topic,
                 'label': row.Index,
-                'title': row.Title,
+                'title': row.Text,
                 'size': 20, 'font': {'size': 20, 'color': 'white'}
             }
         )
-        for row in data.itertuples()
+        for row in topic_data.itertuples()
     ]
     nx_net.add_nodes_from(nodes)
-    assert(nx_net.number_of_nodes() == len(data))
+    assert(nx_net.number_of_nodes() == len(topic_data))
+
+    # Add Edges
+    nx_net.add_edges_from(neighbors)
+    assert(nx_net.number_of_edges() == len(neighbors))
+
+    # Optimization: Remove Isolated nodes
+    nx_net.remove_nodes_from(list(nx.isolates(nx_net)))
 
     # Add Legend Nodes
     step = 150
@@ -150,9 +163,9 @@ def network_plot(data, topics, neighbors):
     y = -500
     legend_nodes = [
         (
-            len(data)+idx,
+            len(topic_data)+idx,
             {
-                'group': key, 'label': ', '.join(value['Name'].split('_')[1:]),
+                'group': key, 'label': ', '.join(value['Topic_Name'].split('_')[1:]),
                 'size': 30, 'physics': False, 'x': x, 'y': f'{y + idx*step}px',
                 # , 'fixed': True,
                 'shape': 'box', 'widthConstraint': 1000, 'font': {'size': 40, 'color': 'black'}
@@ -162,33 +175,47 @@ def network_plot(data, topics, neighbors):
     ]
     nx_net.add_nodes_from(legend_nodes)
 
-    # Add Edges
-    nx_net.add_edges_from(neighbors)
-    assert(nx_net.number_of_edges() == len(neighbors))
-
     # Plot the Pyvis graph
     pyvis_net.from_nx(nx_net)
 
     return nx_net, pyvis_net
 
 
+def text_processing(text):
+    text = text.split('[SEP]')
+    text = '<br><br>'.join(text)
+    text = '<br>'.join(textwrap.wrap(text, width=50))[:500]
+    text = text + '...'
+    return text
+
+
 @st.cache()
-def network_centrality(data, centrality, centrality_option):
+def network_centrality(topic_data, centrality, centrality_option):
     """Calculates the centrality of the network
     """
+    logger.info('Calculating Network Centrality')
     # Sort Top 10 Central nodes
     central_nodes = sorted(
         centrality.items(), key=lambda item: item[1], reverse=True)
     central_nodes = pd.DataFrame(central_nodes, columns=[
         'node', centrality_option]).set_index('node')
 
-    joined_data = data.join(central_nodes)
+    joined_data = topic_data.join(central_nodes)
+
     top_central_nodes = joined_data.sort_values(
         centrality_option, ascending=False).head(10)
 
+    # Prepare for plot
+    top_central_nodes = top_central_nodes.reset_index()
+    top_central_nodes['index'] = top_central_nodes['index'].astype(str)
+    top_central_nodes['Topic_Name'] = top_central_nodes['Topic_Name'].apply(
+        lambda x: ', '.join(x.split('_')[1:]))
+    top_central_nodes['Text'] = top_central_nodes['Text'].apply(
+        text_processing)
+
     # Plot the Top 10 Central nodes
-    fig = px.bar(top_central_nodes, x=centrality_option, y='Title')
-    fig.update_layout(yaxis={'categoryorder': 'total ascending'},
-                      font={'size': 15},
-                      height=800, width=800)
+    fig = px.bar(top_central_nodes, x=centrality_option, y='index',
                 color='Topic_Name', hover_data=['Text'], orientation='h')
+    fig.update_layout(yaxis={'categoryorder': 'total ascending', 'visible': False, 'showticklabels': False},
                      font={'size': 15}, height=800)
    return fig
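
Note on the reworked threshold search in helpers.py: calc_optimal_threshold now sweeps thresholds from high to low and stops as soon as the number of connections exceeds max_connections, so the first return value (the previous, stricter threshold) becomes the slider default and the second becomes the slider minimum. A standalone sketch of that logic, using a small random matrix as a stand-in for the real cosine-similarity matrix (the toy data is illustrative only):

    import numpy as np

    def calc_optimal_threshold(cosine_sim_matrix, max_connections):
        # Sweep from the strictest threshold (1.00) down to 0.05
        thresh_sweep = np.arange(0.05, 1.05, 0.05)[::-1]
        for idx, threshold in enumerate(thresh_sweep):
            neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
            # Stop once the network would get too dense
            if len(neighbors) > max_connections:
                break

        # (default slider value, slider minimum)
        return round(thresh_sweep[idx - 1], 2).item(), round(thresh_sweep[idx], 2).item()

    # Toy upper-triangular similarity matrix for 20 "papers"
    rng = np.random.default_rng(0)
    sim = np.triu(rng.random((20, 20)), k=1)
    value, min_value = calc_optimal_threshold(sim, max_connections=50)
    print(value, min_value)  # default threshold and slider minimum

Paired with the cap of min(helpers.calc_max_connections(len(data), 0.25), 5_000) in app.py, this keeps the default network sparse even for large uploads.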