stephenleo committed
Commit: d2b0a3c
Parent: 7475828

refactor and adding progress bar for emb gen

Files changed (2):
  1. app.py +161 -122
  2. helpers.py +59 -10
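
In short: app.py's monolithic main() is refactored into one function per pipeline step (load_data, topic_modeling, strip_network, network_centrality, about_me) orchestrated by a slim main(); helpers.py gains st_redirect/st_stdout/st_stderr context managers that mirror stdout/stderr into the Streamlit page, so the progress bar printed during embedding generation becomes visible in the app; and calc_neighbors becomes a separate @st.cache() function that calc_optimal_threshold reuses during its threshold sweep instead of recomputing np.argwhere at every step.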
app.py CHANGED
@@ -18,167 +18,187 @@ logging.basicConfig(level=logging.INFO,
 logger = logging.getLogger('main')
 
 
-def main():
-    st.title('STriP (S3P): Semantic Similarity of Scientific Papers!')
+def load_data():
+    """Loads the data from the uploaded file.
+    """
 
     st.header('📂 Load Data')
     uploaded_file = st.file_uploader("Choose a CSV file",
                                      help='Upload a CSV file with the following columns: Title, Abstract')
 
-    ##########
-    # Load data
-    ##########
-    logger.info('========== Step1: Loading data ==========')
     if uploaded_file is not None:
         df = helpers.load_data(uploaded_file)
     else:
         df = helpers.load_data('data.csv')
-
     data = df.copy()
-    selected_cols = st.multiselect('Select columns to analyse', options=data.columns,
+
+    # Column Selection. By default, any column called 'title' and 'abstract' are selected
+    st.subheader('Select columns to analyze')
+    selected_cols = st.multiselect(label='Select one or more columns. All the selected columns are concatenated before analyzing', options=data.columns,
                                    default=[col for col in data.columns if col.lower() in ['title', 'abstract']])
+
+    if not selected_cols:
+        st.error('No columns selected! Please select some text columns to analyze')
+
     data = data[selected_cols]
+
+    # Minor cleanup
     data = data.dropna()
     data = data.reset_index(drop=True)
+
+    # Load max 200 rows only
     st.write(f'Number of rows: {len(data)}')
     if len(data) > 200:
         data = data.iloc[:200]
         st.write(f'Only first 200 rows will be analyzed')
+
+    # Prints
     st.write('First 5 rows of loaded data:')
     st.write(data[selected_cols].head())
 
+    # Combine all selected columns
     if (data is not None) and selected_cols:
-        # For 'allenai-specter'
         data['Text'] = data[data.columns[0]]
         for column in data.columns[1:]:
             data['Text'] = data['Text'] + '[SEP]' + data[column].astype(str)
 
-        ##########
-        # Topic modeling
-        ##########
-        logger.info('========== Step2: Topic modeling ==========')
-        st.header('🔥 Topic Modeling')
-
-        cols = st.columns(3)
-        with cols[0]:
-            min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
-                                       max_value=min(round(len(data)*0.25), 100), step=1, value=min(round(len(data)/25), 10),
-                                       help='The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics.')
-        with cols[1]:
-            n_gram_range = st.slider('N-gram range', key='n_gram_range', min_value=1,
-                                     max_value=3, step=1, value=(1, 2),
-                                     help='N-gram range for the topic model')
-        with cols[2]:
-            st.text('')
-            st.text('')
-            st.button('Reset Defaults', on_click=helpers.reset_default_topic_sliders, key='reset_topic_sliders',
-                      kwargs={'min_topic_size': min(round(len(data)/25), 10), 'n_gram_range': (1, 2)})
-
-        with st.spinner('Topic Modeling'):
-            topic_data, topic_model, topics = helpers.topic_modeling(
-                data, min_topic_size=min_topic_size, n_gram_range=n_gram_range)
-
-        mapping = {
-            'Topic Keywords': topic_model.visualize_barchart,
-            'Topic Similarities': topic_model.visualize_heatmap,
-            'Topic Hierarchies': topic_model.visualize_hierarchy,
-            'Intertopic Distance': topic_model.visualize_topics
-        }
-
-        cols = st.columns(3)
-        with cols[0]:
-            topic_model_vis_option = st.selectbox(
-                'Select Topic Modeling Visualization', mapping.keys())
-        try:
-            fig = mapping[topic_model_vis_option](top_n_topics=10)
-            fig.update_layout(title='')
-            st.plotly_chart(fig, use_container_width=True)
-        except:
-            st.warning(
-                'No visualization available. Try a lower Minimum topic size!')
-
-        ##########
-        # STriP Network
-        ##########
-        logger.info('========== Step3: STriP Network ==========')
-        st.header('🚀 STriP Network')
-
-        with st.spinner('Cosine Similarity Calculation'):
-            cosine_sim_matrix = helpers.cosine_sim(data)
-
-        value, min_value = helpers.calc_optimal_threshold(
-            cosine_sim_matrix,
-            # 25% is a good value for the number of papers
-            max_connections=min(
-                helpers.calc_max_connections(len(data), 0.25), 5_000
-            )
-        )
-
-        cols = st.columns(3)
-        with cols[0]:
-            threshold = st.slider('Cosine Similarity Threshold', key='threshold', min_value=min_value,
-                                  max_value=1.0, step=0.01, value=value,
-                                  help='The minimum cosine similarity between papers to draw a connection. Increasing this value will lead to a lesser connections.')
-
-            neighbors, num_connections = helpers.calc_neighbors(
-                cosine_sim_matrix, threshold)
-            st.write(f'Number of connections: {num_connections}')
-
-        with cols[1]:
-            st.text('')
-            st.text('')
-            st.button('Reset Defaults', on_click=helpers.reset_default_threshold_slider, key='reset_threshold',
-                      kwargs={'threshold': value})
-
-        with st.spinner('Network Generation'):
-            nx_net, pyvis_net = helpers.network_plot(
-                topic_data, topics, neighbors)
-
-        # Save and read graph as HTML file (on Streamlit Sharing)
-        try:
-            path = '/tmp'
-            pyvis_net.save_graph(f'{path}/pyvis_graph.html')
-            HtmlFile = open(f'{path}/pyvis_graph.html',
-                            'r', encoding='utf-8')
-
-        # Save and read graph as HTML file (locally)
-        except:
-            path = '/html_files'
-            pyvis_net.save_graph(f'{path}/pyvis_graph.html')
-            HtmlFile = open(f'{path}/pyvis_graph.html',
-                            'r', encoding='utf-8')
-
-        # Load HTML file in HTML component for display on Streamlit page
-        html(HtmlFile.read(), height=800)
-
-        ##########
-        # Centrality
-        ##########
-        logger.info('========== Step4: Network Centrality ==========')
-        st.header('🏅 Most Important Papers')
-
-        centrality_mapping = {
-            'Closeness Centrality': nx.closeness_centrality,
-            'Degree Centrality': nx.degree_centrality,
-            'Eigenvector Centrality': nx.eigenvector_centrality,
-            'Betweenness Centrality': nx.betweenness_centrality,
-        }
-
-        cols = st.columns(3)
-        with cols[0]:
-            centrality_option = st.selectbox(
-                'Select Centrality Measure', centrality_mapping.keys())
-
-        # Calculate centrality
-        centrality = centrality_mapping[centrality_option](nx_net)
-
-        cols = st.columns([1, 10, 1])
-        with cols[1]:
-            with st.spinner('Network Centrality Calculation'):
-                fig = helpers.network_centrality(
-                    topic_data, centrality, centrality_option)
-                st.plotly_chart(fig, use_container_width=True)
+    return data, selected_cols
+
+
+def topic_modeling(data):
+    """Runs the topic modeling step.
+    """
+
+    st.header('🔥 Topic Modeling')
+    cols = st.columns(3)
+    with cols[0]:
+        min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
+                                   max_value=min(round(len(data)*0.25), 100), step=1, value=min(round(len(data)/25), 10),
+                                   help='The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics.')
+    with cols[1]:
+        n_gram_range = st.slider('N-gram range', key='n_gram_range', min_value=1,
+                                 max_value=3, step=1, value=(1, 2),
+                                 help='N-gram range for the topic model')
+    with cols[2]:
+        st.text('')
+        st.text('')
+        st.button('Reset Defaults', on_click=helpers.reset_default_topic_sliders, key='reset_topic_sliders',
+                  kwargs={'min_topic_size': min(round(len(data)/25), 10), 'n_gram_range': (1, 2)})
+
+    with st.spinner('Topic Modeling'):
+        with helpers.st_stdout("success"), helpers.st_stderr("code"):
+            topic_data, topic_model, topics = helpers.topic_modeling(
+                data, min_topic_size=min_topic_size, n_gram_range=n_gram_range)
+
+    mapping = {
+        'Topic Keywords': topic_model.visualize_barchart,
+        'Topic Similarities': topic_model.visualize_heatmap,
+        'Topic Hierarchies': topic_model.visualize_hierarchy,
+        'Intertopic Distance': topic_model.visualize_topics
+    }
+
+    cols = st.columns(3)
+    with cols[0]:
+        topic_model_vis_option = st.selectbox(
+            'Select Topic Modeling Visualization', mapping.keys())
+    try:
+        fig = mapping[topic_model_vis_option](top_n_topics=10)
+        fig.update_layout(title='')
+        st.plotly_chart(fig, use_container_width=True)
+    except:
+        st.warning(
+            'No visualization available. Try a lower Minimum topic size!')
+
+    return topic_data, topics
+
+
+def strip_network(data, topic_data, topics):
+    """Generated the STriP network.
+    """
+
+    st.header('🚀 STriP Network')
+
+    with st.spinner('Cosine Similarity Calculation'):
+        cosine_sim_matrix = helpers.cosine_sim(data)
+
+    value, min_value = helpers.calc_optimal_threshold(
+        cosine_sim_matrix,
+        # 25% is a good value for the number of papers
+        max_connections=min(
+            helpers.calc_max_connections(len(data), 0.25), 5_000
+        )
+    )
+
+    cols = st.columns(3)
+    with cols[0]:
+        threshold = st.slider('Cosine Similarity Threshold', key='threshold', min_value=min_value,
+                              max_value=1.0, step=0.01, value=value,
+                              help='The minimum cosine similarity between papers to draw a connection. Increasing this value will lead to a lesser connections.')
+
+        neighbors, num_connections = helpers.calc_neighbors(
+            cosine_sim_matrix, threshold)
+        st.write(f'Number of connections: {num_connections}')
+
+    with cols[1]:
+        st.text('')
+        st.text('')
+        st.button('Reset Defaults', on_click=helpers.reset_default_threshold_slider, key='reset_threshold',
+                  kwargs={'threshold': value})
+
+    with st.spinner('Network Generation'):
+        nx_net, pyvis_net = helpers.network_plot(
+            topic_data, topics, neighbors)
+
+    # Save and read graph as HTML file (on Streamlit Sharing)
+    try:
+        path = '/tmp'
+        pyvis_net.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html',
+                        'r', encoding='utf-8')
+
+    # Save and read graph as HTML file (locally)
+    except:
+        path = '/html_files'
+        pyvis_net.save_graph(f'{path}/pyvis_graph.html')
+        HtmlFile = open(f'{path}/pyvis_graph.html',
+                        'r', encoding='utf-8')
+
+    # Load HTML file in HTML component for display on Streamlit page
+    html(HtmlFile.read(), height=800)
+
+    return nx_net
+
+
+def network_centrality(nx_net, topic_data):
+    """Finds most important papers using network centrality measures.
+    """
+
+    st.header('🏅 Most Important Papers')
+
+    centrality_mapping = {
+        'Closeness Centrality': nx.closeness_centrality,
+        'Degree Centrality': nx.degree_centrality,
+        'Eigenvector Centrality': nx.eigenvector_centrality,
+        'Betweenness Centrality': nx.betweenness_centrality,
+    }
+
+    cols = st.columns(3)
+    with cols[0]:
+        centrality_option = st.selectbox(
+            'Select Centrality Measure', centrality_mapping.keys())
+
+    # Calculate centrality
+    centrality = centrality_mapping[centrality_option](nx_net)
+
+    cols = st.columns([1, 10, 1])
+    with cols[1]:
+        with st.spinner('Network Centrality Calculation'):
+            fig = helpers.network_centrality(
+                topic_data, centrality, centrality_option)
+            st.plotly_chart(fig, use_container_width=True)
+
+
+def about_me():
     st.markdown(
         """
         💡🔥🚀 STriP v1.0 🚀🔥💡
@@ -194,5 +214,24 @@ def main():
     )
 
 
+def main():
+    st.title('STriP (S3P): Semantic Similarity of Scientific Papers!')
+
+    logger.info('========== Step1: Loading data ==========')
+    data, selected_cols = load_data()
+
+    if (data is not None) and selected_cols:
+        logger.info('========== Step2: Topic modeling ==========')
+        topic_data, topics = topic_modeling(data)
+
+        logger.info('========== Step3: STriP Network ==========')
+        nx_net = strip_network(data, topic_data, topics)
+
+        logger.info('========== Step4: Network Centrality ==========')
+        network_centrality(nx_net, topic_data)
+
+    about_me()
+
+
 if __name__ == '__main__':
     main()
helpers.py CHANGED
@@ -11,6 +11,13 @@ import networkx as nx
 import textwrap
 import logging
 
+from streamlit.report_thread import REPORT_CONTEXT_ATTR_NAME
+from threading import current_thread
+from contextlib import contextmanager
+from io import StringIO
+import sys
+import time
+
 logger = logging.getLogger('main')
 
 
@@ -70,6 +77,8 @@ def topic_modeling(data, min_topic_size, n_gram_range):
     # Optimization: Only take top 10 largest topics
     topics = topic_df.head(10).set_index('Topic').to_dict(orient='index')
 
+    logger.info('Topic Modeling Complete')
+
     return topic_data, topic_model, topics
 
 
@@ -91,6 +100,13 @@ def calc_max_connections(num_papers, ratio):
     return n*(n-1)/2
 
 
+@st.cache()
+def calc_neighbors(cosine_sim_matrix, threshold):
+    neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
+
+    return neighbors, len(neighbors)
+
+
 @st.cache()
 def calc_optimal_threshold(cosine_sim_matrix, max_connections):
     """Calculates the optimal threshold for the cosine similarity matrix.
@@ -99,21 +115,13 @@ def calc_optimal_threshold(cosine_sim_matrix, max_connections):
     logger.info('Calculating optimal threshold')
     thresh_sweep = np.arange(0.05, 1.05, 0.05)[::-1]
     for idx, threshold in enumerate(thresh_sweep):
-        neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
-        if len(neighbors) > max_connections:
+        _, num_neighbors = calc_neighbors(cosine_sim_matrix, threshold)
+        if num_neighbors > max_connections:
             break
 
     return round(thresh_sweep[idx-1], 2).item(), round(thresh_sweep[idx], 2).item()
 
 
-@st.cache()
-def calc_neighbors(cosine_sim_matrix, threshold):
-    logger.info('Calculating neighbors')
-    neighbors = np.argwhere(cosine_sim_matrix >= threshold).tolist()
-
-    return neighbors, len(neighbors)
-
-
 def nx_hash_func(nx_net):
     """Hash function for NetworkX graphs.
     """
@@ -219,3 +227,44 @@ def network_centrality(topic_data, centrality, centrality_option):
     fig.update_layout(yaxis={'categoryorder': 'total ascending', 'visible': False, 'showticklabels': False},
                       font={'size': 15}, height=800)
     return fig
+
+
+# Progress bar printer
+# https://github.com/BugzTheBunny/streamlit_logging_output_example/blob/main/app.py
+# https://discuss.streamlit.io/t/cannot-print-the-terminal-output-in-streamlit/6602/34
+@contextmanager
+def st_redirect(src, dst):
+    placeholder = st.empty()
+    output_func = getattr(placeholder, dst)
+
+    with StringIO() as buffer:
+        old_write = src.write
+
+        def new_write(b):
+            if getattr(current_thread(), REPORT_CONTEXT_ATTR_NAME, None):
+                buffer.write(b)
+                time.sleep(1)
+                buffer.seek(0)  # returns pointer to 0 position
+                output_func(b)
+            else:
+                old_write(b)
+
+        try:
+            src.write = new_write
+            yield
+        finally:
+            src.write = old_write
+
+
+@contextmanager
+def st_stdout(dst):
+    "this will show the prints"
+    with st_redirect(sys.stdout, dst):
+        yield
+
+
+@contextmanager
+def st_stderr(dst):
+    "This will show the logging"
+    with st_redirect(sys.stderr, dst):
+        yield
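
For reference, these redirect helpers are consumed in app.py's topic_modeling() step, as shown in the diff above. A minimal standalone sketch of the pattern (this assumes a pre-1.0 Streamlit release where streamlit.report_thread still exists, and the slider values shown here are hypothetical placeholders; data is the DataFrame returned by load_data()):

    import helpers

    # Anything written to stdout/stderr inside these blocks (for example, a
    # tqdm-style progress bar printed while embeddings are being generated)
    # is mirrored into st.success / st.code placeholders on the page, via
    # getattr(placeholder, dst) in st_redirect, instead of appearing only
    # in the server terminal.
    with helpers.st_stdout('success'), helpers.st_stderr('code'):
        topic_data, topic_model, topics = helpers.topic_modeling(
            data, min_topic_size=10, n_gram_range=(1, 2))

The dst argument names a method of an st.empty() placeholder ('success', 'code', and similar element types), which is how the same context manager can render captured output as different Streamlit elements.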