Rezoon committed · verified
Commit feefc9e · Parent: f47cc14

Update app.py

Files changed (1): app.py (+53 -34)
app.py CHANGED
@@ -3,8 +3,8 @@ import pandas as pd
  import re
  import nltk
  from nltk.corpus import stopwords
- from bertopic import BERTopic
- from sklearn.feature_extraction.text import CountVectorizer
+ from transformers import pipeline
+ import matplotlib.pyplot as plt

  # Download NLTK stopwords (if not already downloaded)
  nltk.download('stopwords')
@@ -13,7 +13,7 @@ nltk.download('stopwords')

  def clean_text(text):
      """
-     Clean the input text by lowercasing, removing non-alphabet characters,
+     Clean the input text by converting to lowercase, removing non-alphabet characters,
      and extra spaces.
      """
      text = text.lower()
@@ -30,9 +30,22 @@ def remove_stopwords(text):
      filtered_tokens = [token for token in tokens if token not in stop_words]
      return " ".join(filtered_tokens)

+ def analyze_emotion(text, classifier):
+     """
+     Use the Hugging Face pipeline to analyze emotion for the given text.
+     Truncates text to 512 characters if necessary.
+     Returns the label with the highest score.
+     """
+     # Truncate text to avoid long inputs if necessary
+     truncated = text[:512]
+     result = classifier(truncated)[0]
+     # result is a list of dictionaries with keys "label" and "score"
+     best = max(result, key=lambda x: x["score"])
+     return best["label"]
+
  # --- Streamlit App Interface ---

- st.title("NYT Comments Topic Modeling App")
+ st.title("NYT Comments Emotion Analysis App")
  st.markdown(
      """
      Upload your CSV file containing NYT comments.
@@ -44,45 +57,51 @@ st.markdown(
  uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

  if uploaded_file is not None:
+     # Load CSV file
      df = pd.read_csv(uploaded_file)
      st.write("Columns in CSV:", df.columns.tolist())
-
-     # If 'comment_body' is not present, let the user select a column to use
+
+     # Let user select a text column if 'comment_body' is not present
      if 'comment_body' not in df.columns:
          st.warning("Column 'comment_body' not found in CSV.")
-         text_column = st.selectbox("Select the column to use for topic modeling", options=df.columns.tolist())
+         text_column = st.selectbox("Select the column to use for emotion analysis", options=df.columns.tolist())
      else:
          text_column = 'comment_body'
-
+
      st.markdown("### Preprocessing Comments")
      # Preprocess the selected column's text: clean and remove stopwords
      df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
      st.write("Sample processed comments:")
      st.write(df[[text_column, 'clean_comment']].head())
-
-     # Prepare documents for topic modeling
-     docs = df['clean_comment'].tolist()
-
-     st.markdown("### Topic Modeling Settings")
-     num_topics = st.slider("Select approximate number of topics", min_value=2, max_value=20, value=5, step=1)
-
-     st.markdown("### Running Topic Modeling")
-     # Create a CountVectorizer using English stopwords
-     vectorizer_model = CountVectorizer(stop_words="english")

-     # Initialize BERTopic with the vectorizer and approximate number of topics
-     topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=num_topics)
-     topics, probs = topic_model.fit_transform(docs)
-     st.write("Topic modeling complete!")
-
-     st.markdown("#### Topics Found")
-     topic_info = topic_model.get_topic_info()
-     st.write(topic_info)
-
-     st.markdown("#### Topic Bar Chart")
-     fig_bar = topic_model.visualize_barchart()
-     st.plotly_chart(fig_bar)
-
-     st.markdown("#### Topic Visualization (Scatter Plot)")
-     fig_topics = topic_model.visualize_topics()
-     st.plotly_chart(fig_topics)
+     st.markdown("### Running Emotion Analysis")
+     with st.spinner("Loading emotion analysis model..."):
+         # Initialize the Hugging Face emotion analysis pipeline
+         emotion_classifier = pipeline("text-classification",
+                                       model="j-hartmann/emotion-english-distilroberta-base",
+                                       return_all_scores=True)
+
+     # Analyze emotion for each comment
+     st.markdown("Analyzing emotions for each comment...")
+     # To avoid long waiting times, you can limit analysis to the first N rows
+     N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
+     if N > 0:
+         df_subset = df.head(N)
+     else:
+         df_subset = df.copy()
+
+     # Apply emotion analysis to each cleaned comment
+     df_subset['emotion'] = df_subset['clean_comment'].apply(lambda x: analyze_emotion(x, emotion_classifier))
+
+     st.write("Sample emotion analysis results:")
+     st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())
+
+     # Aggregate emotion counts for visualization
+     emotion_counts = df_subset['emotion'].value_counts().reset_index()
+     emotion_counts.columns = ['emotion', 'count']
+
+     st.markdown("### Emotion Distribution")
+     st.bar_chart(emotion_counts.set_index('emotion'))
+
+     st.markdown("### Detailed Results")
+     st.write(df_subset)
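
A note on the new code (not part of the commit itself): two details are worth flagging. `return_all_scores=True` is deprecated in recent transformers releases in favor of `top_k=None`, and `text[:512]` truncates characters rather than tokens, so a long comment can still exceed the model's 512-token input limit. Below is a minimal sketch of the same helper written against a newer transformers version; the call-time `truncation=True` flag and the result-nesting normalization are assumptions to verify against the installed version:

    from transformers import pipeline

    # top_k=None replaces the deprecated return_all_scores=True and returns
    # a score for every emotion label.
    classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
    )

    def analyze_emotion(text, classifier):
        """Return the highest-scoring emotion label for the given text."""
        # truncation=True truncates at the token level instead of relying
        # on a character slice.
        result = classifier(text, truncation=True)
        # Depending on the transformers version and on whether top_k is set
        # at construction or call time, a single-string call returns either
        # [[{"label", "score"}, ...]] or [{"label", "score"}, ...]; handle both.
        scores = result[0] if isinstance(result[0], list) else result
        return max(scores, key=lambda s: s["score"])["label"]

Two smaller points: `df.head(N)` returns a slice of `df`, so assigning the 'emotion' column to it can trigger pandas' SettingWithCopyWarning (`df.head(N).copy()` avoids this), and classifying one row at a time with `.apply` is slow; pipelines accept a list of texts, so something like `classifier(df_subset['clean_comment'].tolist(), truncation=True, batch_size=32)` should process the comments in batches. Also, `matplotlib.pyplot` is imported but never used; the chart is drawn with `st.bar_chart`.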