Rezoon committed on
Commit
f47cc14
·
verified ·
1 Parent(s): bcdd64b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -42
app.py CHANGED
@@ -6,14 +6,14 @@ from nltk.corpus import stopwords
6
  from bertopic import BERTopic
7
  from sklearn.feature_extraction.text import CountVectorizer
8
 
9
- # Download NLTK stopwords if not already downloaded
10
  nltk.download('stopwords')
11
 
12
  # --- Helper Functions ---
13
 
14
  def clean_text(text):
15
  """
16
- Clean the input text by lowering case, removing non-alphabetical characters,
17
  and extra spaces.
18
  """
19
  text = text.lower()
@@ -23,7 +23,7 @@ def clean_text(text):
23
 
24
  def remove_stopwords(text):
25
  """
26
- Remove English stopwords from the text.
27
  """
28
  stop_words = set(stopwords.words('english'))
29
  tokens = text.split()
@@ -35,8 +35,8 @@ def remove_stopwords(text):
35
  st.title("NYT Comments Topic Modeling App")
36
  st.markdown(
37
  """
38
- This app performs topic modeling on your CSV file containing NYT comments.
39
- **Important:** The CSV file must include a column named **comment_body**.
40
  """
41
  )
42
 
@@ -44,44 +44,45 @@ st.markdown(
44
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
45
 
46
  if uploaded_file is not None:
47
- # Read CSV file
48
  df = pd.read_csv(uploaded_file)
49
  st.write("Columns in CSV:", df.columns.tolist())
50
-
51
- # Verify that the CSV contains the expected column
52
  if 'comment_body' not in df.columns:
53
- st.error("The CSV file must contain a column named 'comment_body'.")
 
54
  else:
55
- st.markdown("### Preprocessing Comments")
56
- # Preprocess the text: clean and remove stopwords
57
- df['clean_comment'] = df['comment_body'].astype(str).apply(clean_text).apply(remove_stopwords)
58
- st.write("Sample processed comments:")
59
- st.write(df[['comment_body', 'clean_comment']].head())
60
-
61
- # Prepare documents for topic modeling
62
- docs = df['clean_comment'].tolist()
63
-
64
- st.markdown("### Topic Modeling Settings")
65
- # Let the user choose an approximate number of topics
66
- num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)
67
-
68
- st.markdown("### Running Topic Modeling")
69
- # Create a CountVectorizer instance using English stopwords
70
- vectorizer_model = CountVectorizer(stop_words="english")
71
-
72
- # Initialize BERTopic with the vectorizer and approximate number of topics
73
- topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
74
- topics, probs = topic_model.fit_transform(docs)
75
- st.write("Topic modeling complete!")
76
-
77
- st.markdown("#### Topics Found")
78
- topic_info = topic_model.get_topic_info()
79
- st.write(topic_info)
80
-
81
- st.markdown("#### Topic Bar Chart")
82
- fig_bar = topic_model.visualize_barchart()
83
- st.plotly_chart(fig_bar)
84
-
85
- st.markdown("#### Topic Visualization (Scatter Plot)")
86
- fig_topics = topic_model.visualize_topics()
87
- st.plotly_chart(fig_topics)
 
 
6
  from bertopic import BERTopic
7
  from sklearn.feature_extraction.text import CountVectorizer
8
 
9
+ # Download NLTK stopwords (if not already downloaded)
10
  nltk.download('stopwords')
11
 
12
  # --- Helper Functions ---
13
 
14
  def clean_text(text):
15
  """
16
+ Clean the input text by lowercasing, removing non-alphabet characters,
17
  and extra spaces.
18
  """
19
  text = text.lower()
 
23
 
24
  def remove_stopwords(text):
25
  """
26
+ Remove common English stopwords from the text.
27
  """
28
  stop_words = set(stopwords.words('english'))
29
  tokens = text.split()
 
35
  st.title("NYT Comments Topic Modeling App")
36
  st.markdown(
37
  """
38
+ Upload your CSV file containing NYT comments.
39
+ If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
40
  """
41
  )
42
 
 
44
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
45
 
46
  if uploaded_file is not None:
 
47
  df = pd.read_csv(uploaded_file)
48
  st.write("Columns in CSV:", df.columns.tolist())
49
+
50
+ # If 'comment_body' is not present, let the user select a column to use
51
  if 'comment_body' not in df.columns:
52
+ st.warning("Column 'comment_body' not found in CSV.")
53
+ text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
54
  else:
55
+ text_column = 'comment_body'
56
+
57
+ st.markdown("### Preprocessing Comments")
58
+ # Preprocess the selected column's text: clean and remove stopwords
59
+ df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
60
+ st.write("Sample processed comments:")
61
+ st.write(df[[text_column, 'clean_comment']].head())
62
+
63
+ # Prepare documents for topic modeling
64
+ docs = df['clean_comment'].tolist()
65
+
66
+ st.markdown("### Topic Modeling Settings")
67
+ num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)
68
+
69
+ st.markdown("### Running Topic Modeling")
70
+ # Create a CountVectorizer using English stopwords
71
+ vectorizer_model = CountVectorizer(stop_words="english")
72
+
73
+ # Initialize BERTopic with the vectorizer and approximate number of topics
74
+ topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
75
+ topics, probs = topic_model.fit_transform(docs)
76
+ st.write("Topic modeling complete!")
77
+
78
+ st.markdown("#### Topics Found")
79
+ topic_info = topic_model.get_topic_info()
80
+ st.write(topic_info)
81
+
82
+ st.markdown("#### Topic Bar Chart")
83
+ fig_bar = topic_model.visualize_barchart()
84
+ st.plotly_chart(fig_bar)
85
+
86
+ st.markdown("#### Topic Visualization (Scatter Plot)")
87
+ fig_topics = topic_model.visualize_topics()
88
+ st.plotly_chart(fig_topics)