Hugging Face Space (status: Sleeping) — commit "Update app.py"
Diff view of app.py (CHANGED) follows.
|
@@ -6,14 +6,14 @@ from nltk.corpus import stopwords
|
|
| 6 |
from bertopic import BERTopic
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 8 |
|
| 9 |
-
# Download NLTK stopwords if not already downloaded
|
| 10 |
nltk.download('stopwords')
|
| 11 |
|
| 12 |
# --- Helper Functions ---
|
| 13 |
|
| 14 |
def clean_text(text):
|
| 15 |
"""
|
| 16 |
-
Clean the input text by
|
| 17 |
and extra spaces.
|
| 18 |
"""
|
| 19 |
text = text.lower()
|
|
@@ -23,7 +23,7 @@ def clean_text(text):
|
|
| 23 |
|
| 24 |
def remove_stopwords(text):
|
| 25 |
"""
|
| 26 |
-
Remove English stopwords from the text.
|
| 27 |
"""
|
| 28 |
stop_words = set(stopwords.words('english'))
|
| 29 |
tokens = text.split()
|
|
@@ -35,8 +35,8 @@ def remove_stopwords(text):
|
|
| 35 |
st.title("NYT Comments Topic Modeling App")
|
| 36 |
st.markdown(
|
| 37 |
"""
|
| 38 |
-
|
| 39 |
-
|
| 40 |
"""
|
| 41 |
)
|
| 42 |
|
|
@@ -44,44 +44,45 @@ st.markdown(
|
|
| 44 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
|
| 45 |
|
| 46 |
if uploaded_file is not None:
|
| 47 |
-
# Read CSV file
|
| 48 |
df = pd.read_csv(uploaded_file)
|
| 49 |
st.write("Columns in CSV:", df.columns.tolist())
|
| 50 |
-
|
| 51 |
-
#
|
| 52 |
if 'comment_body' not in df.columns:
|
| 53 |
-
st.
|
|
|
|
| 54 |
else:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
| 6 |
from bertopic import BERTopic
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 8 |
|
| 9 |
+
# Download NLTK stopwords (if not already downloaded)
|
| 10 |
nltk.download('stopwords')
|
| 11 |
|
| 12 |
# --- Helper Functions ---
|
| 13 |
|
| 14 |
def clean_text(text):
|
| 15 |
"""
|
| 16 |
+
Clean the input text by lowercasing, removing non-alphabet characters,
|
| 17 |
and extra spaces.
|
| 18 |
"""
|
| 19 |
text = text.lower()
|
|
|
|
| 23 |
|
| 24 |
def remove_stopwords(text):
|
| 25 |
"""
|
| 26 |
+
Remove common English stopwords from the text.
|
| 27 |
"""
|
| 28 |
stop_words = set(stopwords.words('english'))
|
| 29 |
tokens = text.split()
|
|
|
|
| 35 |
st.title("NYT Comments Topic Modeling App")

# Intro banner: tells the user what to upload and that a fallback column
# picker appears when the expected 'comment_body' column is absent.
st.markdown(
    """
    Upload your CSV file containing NYT comments.
    If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
    """
)
|
| 42 |
|
|
|
|
| 44 |
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load the uploaded CSV and show its columns so the user can verify
    # the file is the one they intended.
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # If 'comment_body' is not present, let the user select a column to use.
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
    else:
        text_column = 'comment_body'

    st.markdown("### Preprocessing Comments")
    # Preprocess the selected column's text: clean and remove stopwords.
    # astype(str) guards against NaN / non-string cells in the column.
    df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())

    # Prepare documents for topic modeling. Drop rows whose cleaned text is
    # empty — BERTopic cannot embed empty documents, and an all-empty corpus
    # would otherwise crash fit_transform with an opaque error.
    docs = [doc for doc in df['clean_comment'].tolist() if doc.strip()]
    if not docs:
        st.error("No usable text found in the selected column after preprocessing.")
        st.stop()

    st.markdown("### Topic Modeling Settings")
    num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)

    st.markdown("### Running Topic Modeling")
    # Create a CountVectorizer using English stopwords (BERTopic uses it
    # for topic-word extraction, not for the document embeddings).
    vectorizer_model = CountVectorizer(stop_words="english")

    # Initialize BERTopic; nr_topics asks BERTopic to reduce the discovered
    # topics toward this approximate count after fitting.
    topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
    topics, probs = topic_model.fit_transform(docs)
    st.write("Topic modeling complete!")

    st.markdown("#### Topics Found")
    topic_info = topic_model.get_topic_info()
    st.write(topic_info)

    st.markdown("#### Topic Bar Chart")
    fig_bar = topic_model.visualize_barchart()
    st.plotly_chart(fig_bar)

    st.markdown("#### Topic Visualization (Scatter Plot)")
    fig_topics = topic_model.visualize_topics()
    st.plotly_chart(fig_topics)
|