# NYT Comments Topic Modeling App — Streamlit + BERTopic
# Third-party dependencies.
import re

import nltk
import pandas as pd
import streamlit as st
from bertopic import BERTopic
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Ensure the NLTK stopword corpus is available (no-op if already downloaded).
nltk.download('stopwords')
# --- Helper Functions ---
def clean_text(text):
    """Normalize raw comment text.

    Lowercases the input, strips every character that is not a lowercase
    letter or whitespace, then collapses whitespace runs into single
    spaces and trims the ends.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()
def remove_stopwords(text):
    """Remove common English stopwords from *text*.

    The NLTK stopword set is built once on first call and cached on the
    function object; the original rebuilt it on every invocation, which
    is wasteful because this function runs once per DataFrame row via
    ``.apply``.

    Parameters: text (str) — whitespace-separated, already-cleaned text.
    Returns: str — the text with stopword tokens dropped.
    """
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return " ".join(token for token in text.split() if token not in stop_words)
# --- Streamlit App Interface ---
st.title("NYT Comments Topic Modeling App")
st.markdown(
    """
Upload your CSV file containing NYT comments.
If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
"""
)

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # If 'comment_body' is not present, let the user select a column to use.
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
    else:
        text_column = 'comment_body'

    st.markdown("### Preprocessing Comments")
    # Preprocess the selected column's text: clean and remove stopwords.
    df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())

    # Prepare documents for topic modeling. Comments can become empty
    # strings after cleaning (e.g. all-punctuation or all-stopword text);
    # BERTopic/CountVectorizer cannot handle empty documents, so drop them.
    docs = [doc for doc in df['clean_comment'].tolist() if doc.strip()]

    if len(docs) < 2:
        # Topic modeling needs at least a couple of non-empty documents.
        st.error("Not enough non-empty comments remain after preprocessing to run topic modeling.")
    else:
        st.markdown("### Topic Modeling Settings")
        num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)

        st.markdown("### Running Topic Modeling")
        # Create a CountVectorizer using English stopwords.
        vectorizer_model = CountVectorizer(stop_words="english")
        # Initialize BERTopic with the vectorizer and approximate number of topics.
        topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
        topics, probs = topic_model.fit_transform(docs)
        st.write("Topic modeling complete!")

        st.markdown("#### Topics Found")
        topic_info = topic_model.get_topic_info()
        st.write(topic_info)

        st.markdown("#### Topic Bar Chart")
        fig_bar = topic_model.visualize_barchart()
        st.plotly_chart(fig_bar)

        st.markdown("#### Topic Visualization (Scatter Plot)")
        fig_topics = topic_model.visualize_topics()
        st.plotly_chart(fig_topics)