Spaces:

Rezoon
/

TopicAnalysis

Sleeping

App Files Files Community

TopicAnalysis / app.py

Rezoon

Update app.py

f47cc14 verified 9 months ago

raw

history blame

3 kB

	import streamlit as st
	import pandas as pd
	import re
	import nltk
	from nltk.corpus import stopwords
	from bertopic import BERTopic
	from sklearn.feature_extraction.text import CountVectorizer

	# Download NLTK stopwords (if not already downloaded)
	nltk.download('stopwords')

	# --- Helper Functions ---

	def clean_text(text):
	"""
	Clean the input text by lowercasing, removing non-alphabet characters,
	and extra spaces.
	"""
	text = text.lower()
	text = re.sub(r'[^a-z\s]', '', text)
	text = re.sub(r'\s+', ' ', text).strip()
	return text

	def remove_stopwords(text):
	"""
	Remove common English stopwords from the text.
	"""
	stop_words = set(stopwords.words('english'))
	tokens = text.split()
	filtered_tokens = [token for token in tokens if token not in stop_words]
	return " ".join(filtered_tokens)

	# --- Streamlit App Interface ---

	st.title("NYT Comments Topic Modeling App")
	st.markdown(
	"""
	Upload your CSV file containing NYT comments.
	If your CSV doesn’t have a column named comment_body, you can select which column to use.
	"""
	)

	# File uploader widget
	uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

	if uploaded_file is not None:
	df = pd.read_csv(uploaded_file)
	st.write("Columns in CSV:", df.columns.tolist())

	# If 'comment_body' is not present, let the user select a column to use
	if 'comment_body' not in df.columns:
	st.warning("Column 'comment_body' not found in CSV.")
	text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
	else:
	text_column = 'comment_body'

	st.markdown("### Preprocessing Comments")
	# Preprocess the selected column's text: clean and remove stopwords
	df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
	st.write("Sample processed comments:")
	st.write(df[[text_column, 'clean_comment']].head())

	# Prepare documents for topic modeling
	docs = df['clean_comment'].tolist()

	st.markdown("### Topic Modeling Settings")
	num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)

	st.markdown("### Running Topic Modeling")
	# Create a CountVectorizer using English stopwords
	vectorizer_model = CountVectorizer(stop_words="english")

	# Initialize BERTopic with the vectorizer and approximate number of topics
	topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
	topics, probs = topic_model.fit_transform(docs)
	st.write("Topic modeling complete!")

	st.markdown("#### Topics Found")
	topic_info = topic_model.get_topic_info()
	st.write(topic_info)

	st.markdown("#### Topic Bar Chart")
	fig_bar = topic_model.visualize_barchart()
	st.plotly_chart(fig_bar)

	st.markdown("#### Topic Visualization (Scatter Plot)")
	fig_topics = topic_model.visualize_topics()
	st.plotly_chart(fig_topics)