# TopicAnalysis / app.py
import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')
# --- Helper Functions ---
def clean_text(text):
    """
    Clean the input text by lowercasing, removing non-alphabetic characters,
    and collapsing extra whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def remove_stopwords(text):
    """
    Remove common English stopwords from the text.
    """
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)
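# Illustrative example (not executed by the app), assuming the two helpers above:
#   clean_text("NYT: 3 Reasons to Read!")    -> "nyt reasons to read"
#   remove_stopwords("nyt reasons to read")  -> "nyt reasons read"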
# --- Streamlit App Interface ---
st.title("NYT Comments Topic Modeling App")
st.markdown(
"""
Upload your CSV file containing NYT comments.
If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
"""
)
# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())
    # If 'comment_body' is not present, let the user select a column to use
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
    else:
        text_column = 'comment_body'
    st.markdown("### Preprocessing Comments")
    # Preprocess the selected column's text: clean and remove stopwords
    df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())
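    # Optional sketch (not in the original script): drop rows whose cleaned comment is
    # empty after preprocessing so that blank documents are not passed to BERTopic.
    # Uses only pandas; remove this step if you prefer to keep every row.
    df = df[df['clean_comment'].str.strip() != ''].reset_index(drop=True)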
    # Prepare documents for topic modeling
    docs = df['clean_comment'].tolist()
    st.markdown("### Topic Modeling Settings")
    num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)
st.markdown("### Running Topic Modeling")
# Create a CountVectorizer using English stopwords
vectorizer_model = CountVectorizer(stop_words="english")
# Initialize BERTopic with the vectorizer and approximate number of topics
topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
topics, probs = topic_model.fit_transform(docs)
st.write("Topic modeling complete!")
st.markdown("#### Topics Found")
topic_info = topic_model.get_topic_info()
st.write(topic_info)
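    # Optional sketch (not in the original script): attach the per-document topic ids
    # returned by fit_transform back onto the dataframe so individual comments can be
    # browsed alongside their assigned topic. Topic -1 is BERTopic's outlier bucket.
    st.markdown("#### Per-Document Topic Assignments")
    df['topic'] = topics
    st.write(df[[text_column, 'topic']].head(20))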
st.markdown("#### Topic Bar Chart")
fig_bar = topic_model.visualize_barchart()
st.plotly_chart(fig_bar)
st.markdown("#### Topic Visualization (Scatter Plot)")
fig_topics = topic_model.visualize_topics()
st.plotly_chart(fig_topics)
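    # Optional sketch (not in the original script): let the user drill into one topic.
    # get_topic() returns the topic's top words with their c-TF-IDF weights; the
    # outlier topic -1 is excluded from the choices.
    st.markdown("#### Inspect a Single Topic")
    topic_ids = [t for t in topic_info["Topic"].tolist() if t != -1]
    if topic_ids:
        selected_topic = st.selectbox("Choose a topic to inspect", options=topic_ids)
        st.write(pd.DataFrame(topic_model.get_topic(selected_topic), columns=["word", "weight"]))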