Rezoon committed · verified
Commit feefc9e · Parent: f47cc14

Update app.py

Files changed (1): app.py (+53 -34)
app.py CHANGED
@@ -3,8 +3,8 @@ import pandas as pd
  import re
  import nltk
  from nltk.corpus import stopwords
- from bertopic import BERTopic
- from sklearn.feature_extraction.text import CountVectorizer
+ from transformers import pipeline
+ import matplotlib.pyplot as plt

  # Download NLTK stopwords (if not already downloaded)
  nltk.download('stopwords')
@@ -13,7 +13,7 @@ nltk.download('stopwords')

  def clean_text(text):
      """
-     Clean the input text by lowercasing, removing non-alphabet characters,
+     Clean the input text by converting to lowercase, removing non-alphabet characters,
      and extra spaces.
      """
      text = text.lower()
@@ -30,9 +30,22 @@ def remove_stopwords(text):
      filtered_tokens = [token for token in tokens if token not in stop_words]
      return " ".join(filtered_tokens)

+ def analyze_emotion(text, classifier):
+     """
+     Use the Hugging Face pipeline to analyze emotion for the given text.
+     Truncates text to 512 characters if necessary.
+     Returns the label with the highest score.
+     """
+     # Truncate text to avoid long inputs if necessary
+     truncated = text[:512]
+     result = classifier(truncated)[0]
+     # result is a list of dictionaries with keys "label" and "score"
+     best = max(result, key=lambda x: x["score"])
+     return best["label"]
+
  # --- Streamlit App Interface ---

- st.title("NYT Comments Topic Modeling App")
+ st.title("NYT Comments Emotion Analysis App")
  st.markdown(
      """
      Upload your CSV file containing NYT comments.
@@ -44,45 +57,51 @@ st.markdown(
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

  if uploaded_file is not None:
+     # Load CSV file
      df = pd.read_csv(uploaded_file)
      st.write("Columns in CSV:", df.columns.tolist())
-
-     # If 'comment_body' is not present, let the user select a column to use
+
+     # Let user select a text column if 'comment_body' is not present
      if 'comment_body' not in df.columns:
          st.warning("Column 'comment_body' not found in CSV.")
-         text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
+         text_column = st.selectbox("Select the column to use for emotion analysis", options=df.columns.tolist())
      else:
          text_column = 'comment_body'
-
+
      st.markdown("### Preprocessing Comments")
      # Preprocess the selected column's text: clean and remove stopwords
      df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
      st.write("Sample processed comments:")
      st.write(df[[text_column, 'clean_comment']].head())
-
-     # Prepare documents for topic modeling
-     docs = df['clean_comment'].tolist()
-
-     st.markdown("### Topic Modeling Settings")
-     num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)
-
-     st.markdown("### Running Topic Modeling")
-     # Create a CountVectorizer using English stopwords
-     vectorizer_model = CountVectorizer(stop_words="english")

-     # Initialize BERTopic with the vectorizer and approximate number of topics
-     topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
-     topics, probs = topic_model.fit_transform(docs)
-     st.write("Topic modeling complete!")
-
-     st.markdown("#### Topics Found")
-     topic_info = topic_model.get_topic_info()
-     st.write(topic_info)
-
-     st.markdown("#### Topic Bar Chart")
-     fig_bar = topic_model.visualize_barchart()
-     st.plotly_chart(fig_bar)
-
-     st.markdown("#### Topic Visualization (Scatter Plot)")
-     fig_topics = topic_model.visualize_topics()
-     st.plotly_chart(fig_topics)
+     st.markdown("### Running Emotion Analysis")
+     with st.spinner("Loading emotion analysis model..."):
+         # Initialize the Hugging Face emotion analysis pipeline
+         emotion_classifier = pipeline("text-classification",
+                                       model="j-hartmann/emotion-english-distilroberta-base",
+                                       return_all_scores=True)
+
+     # Analyze emotion for each comment
+     st.markdown("Analyzing emotions for each comment...")
+     # To avoid long waiting times, you can limit analysis to the first N rows
+     N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
+     if N > 0:
+         df_subset = df.head(N)
+     else:
+         df_subset = df.copy()
+
+     # Apply emotion analysis to each cleaned comment
+     df_subset['emotion'] = df_subset['clean_comment'].apply(lambda x: analyze_emotion(x, emotion_classifier))
+
+     st.write("Sample emotion analysis results:")
+     st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())
+
+     # Aggregate emotion counts for visualization
+     emotion_counts = df_subset['emotion'].value_counts().reset_index()
+     emotion_counts.columns = ['emotion', 'count']
+
+     st.markdown("### Emotion Distribution")
+     st.bar_chart(emotion_counts.set_index('emotion'))
+
+     st.markdown("### Detailed Results")
+     st.write(df_subset)
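
A note on the new code (not part of the commit itself): two details are worth flagging. `return_all_scores=True` is deprecated in recent transformers releases in favor of `top_k=None`, and `text[:512]` truncates characters rather than tokens, so a long comment can still exceed the model's 512-token input limit. Below is a minimal sketch of the same helper written against a newer transformers version; the call-time `truncation=True` flag and the result-nesting normalization are assumptions to verify against the installed version:

    from transformers import pipeline

    # top_k=None replaces the deprecated return_all_scores=True and returns
    # a score for every emotion label.
    classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
    )

    def analyze_emotion(text, classifier):
        """Return the highest-scoring emotion label for the given text."""
        # truncation=True truncates at the token level instead of relying
        # on a character slice.
        result = classifier(text, truncation=True)
        # Depending on the transformers version and on whether top_k is set
        # at construction or call time, a single-string call returns either
        # [[{"label", "score"}, ...]] or [{"label", "score"}, ...]; handle both.
        scores = result[0] if isinstance(result[0], list) else result
        return max(scores, key=lambda s: s["score"])["label"]

Two smaller points: `df.head(N)` returns a slice of `df`, so assigning the 'emotion' column to it can trigger pandas' SettingWithCopyWarning (`df.head(N).copy()` avoids this), and classifying one row at a time with `.apply` is slow; pipelines accept a list of texts, so something like `classifier(df_subset['clean_comment'].tolist(), truncation=True, batch_size=32)` should process the comments in batches. Also, `matplotlib.pyplot` is imported but never used; the chart is drawn with `st.bar_chart`.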