Hugging Face Space (status: Sleeping) — commit "Update app.py"
Diff view of app.py (CHANGED) follows.
|
@@ -6,14 +6,14 @@ from nltk.corpus import stopwords
|
|
| 6 |
from bertopic import BERTopic
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 8 |
|
| 9 |
-
# Download NLTK stopwords if not already downloaded
|
| 10 |
nltk.download('stopwords')
|
| 11 |
|
| 12 |
# --- Helper Functions ---
|
| 13 |
|
| 14 |
def clean_text(text):
|
| 15 |
"""
|
| 16 |
-
Clean the input text by
|
| 17 |
and extra spaces.
|
| 18 |
"""
|
| 19 |
text = text.lower()
|
|
@@ -23,7 +23,7 @@ def clean_text(text):
|
|
| 23 |
|
| 24 |
def remove_stopwords(text):
|
| 25 |
"""
|
| 26 |
-
Remove English stopwords from the text.
|
| 27 |
"""
|
| 28 |
stop_words = set(stopwords.words('english'))
|
| 29 |
tokens = text.split()
|
|
@@ -35,8 +35,8 @@ def remove_stopwords(text):
|
|
| 35 |
st.title("NYT Comments Topic Modeling App")
|
| 36 |
st.markdown(
|
| 37 |
"""
|
| 38 |
-
|
| 39 |
-
|
| 40 |
"""
|
| 41 |
)
|
| 42 |
|
|
@@ -44,44 +44,45 @@ st.markdown(
|
|
| 44 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
| 45 |
|
| 46 |
if uploaded_file is not None:
|
| 47 |
-
# Read CSV file
|
| 48 |
df = pd.read_csv(uploaded_file)
|
| 49 |
st.write("Columns in CSV:", df.columns.tolist())
|
| 50 |
-
|
| 51 |
-
#
|
| 52 |
if 'comment_body' not in df.columns:
|
| 53 |
-
st.
|
|
|
|
| 54 |
else:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
| 6 |
from bertopic import BERTopic
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 8 |
|
| 9 |
+
# Download NLTK stopwords (if not already downloaded)
|
| 10 |
nltk.download('stopwords')
|
| 11 |
|
| 12 |
# --- Helper Functions ---
|
| 13 |
|
| 14 |
def clean_text(text):
|
| 15 |
"""
|
| 16 |
+
Clean the input text by lowercasing, removing non-alphabet characters,
|
| 17 |
and extra spaces.
|
| 18 |
"""
|
| 19 |
text = text.lower()
|
|
|
|
| 23 |
|
| 24 |
def remove_stopwords(text):
|
| 25 |
"""
|
| 26 |
+
Remove common English stopwords from the text.
|
| 27 |
"""
|
| 28 |
stop_words = set(stopwords.words('english'))
|
| 29 |
tokens = text.split()
|
|
|
|
| 35 |
st.title("NYT Comments Topic Modeling App")

# Intro banner: tells the user what to upload and that a fallback column
# picker appears when the expected 'comment_body' column is absent.
st.markdown(
    """
    Upload your CSV file containing NYT comments.
    If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
    """
)
|
| 42 |
|
|
|
|
| 44 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load the uploaded CSV and show its columns so the user can verify
    # the file is the one they intended.
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # If 'comment_body' is not present, let the user select a column to use.
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
    else:
        text_column = 'comment_body'

    st.markdown("### Preprocessing Comments")
    # Preprocess the selected column's text: clean and remove stopwords.
    # astype(str) guards against NaN / non-string cells in the column.
    df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())

    # Prepare documents for topic modeling. Drop rows whose cleaned text is
    # empty — BERTopic cannot embed empty documents, and an all-empty corpus
    # would otherwise crash fit_transform with an opaque error.
    docs = [doc for doc in df['clean_comment'].tolist() if doc.strip()]
    if not docs:
        st.error("No usable text found in the selected column after preprocessing.")
        st.stop()

    st.markdown("### Topic Modeling Settings")
    num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)

    st.markdown("### Running Topic Modeling")
    # Create a CountVectorizer using English stopwords (BERTopic uses it
    # for topic-word extraction, not for the document embeddings).
    vectorizer_model = CountVectorizer(stop_words="english")

    # Initialize BERTopic; nr_topics asks BERTopic to reduce the discovered
    # topics toward this approximate count after fitting.
    topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
    topics, probs = topic_model.fit_transform(docs)
    st.write("Topic modeling complete!")

    st.markdown("#### Topics Found")
    topic_info = topic_model.get_topic_info()
    st.write(topic_info)

    st.markdown("#### Topic Bar Chart")
    fig_bar = topic_model.visualize_barchart()
    st.plotly_chart(fig_bar)

    st.markdown("#### Topic Visualization (Scatter Plot)")
    fig_topics = topic_model.visualize_topics()
    st.plotly_chart(fig_topics)
|