# TopicAnalysis / app.py
import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import pipeline
import matplotlib.pyplot as plt
# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')


# --- Helper Functions ---
def clean_text(text):
    """
    Clean the input text by lowercasing it and removing non-alphabetic
    characters and extra whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
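
# Illustrative usage (hypothetical input, not taken from any real dataset):
#   clean_text("Great article!! 100% agree...")  ->  "great article agree"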


def remove_stopwords(text):
    """
    Remove common English stopwords from the text.
    """
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


def analyze_emotion(text, classifier):
    """
    Use the Hugging Face pipeline to analyze the emotion of the given text.
    Truncates the input to its first 512 characters to keep it short
    (a rough character-level cap, not an exact token limit).
    Returns the label with the highest score.
    """
    # Truncate the text to avoid overly long inputs
    truncated = text[:512]
    result = classifier(truncated)[0]
    # result is a list of dictionaries with keys "label" and "score"
    best = max(result, key=lambda x: x["score"])
    return best["label"]
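
# With all scores returned, a single call produces output shaped roughly like
# (labels and scores below are illustrative, not actual model output):
#   classifier("some text")  ->  [[{'label': 'joy', 'score': 0.91},
#                                  {'label': 'anger', 'score': 0.02}, ...]]
# analyze_emotion() indexes into the outer list and keeps the top-scoring label.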


# --- Streamlit App Interface ---
st.title("NYT Comments Emotion Analysis App")
st.markdown(
    """
    Upload your CSV file containing NYT comments.
    If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
    """
)

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load the CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # Let the user select a text column if 'comment_body' is not present
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for emotion analysis",
                                   options=df.columns.tolist())
    else:
        text_column = 'comment_body'
st.markdown("### Preprocessing Comments")
# Preprocess the selected column's text: clean and remove stopwords
df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
st.write("Sample processed comments:")
st.write(df[[text_column, 'clean_comment']].head())
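
    # Note: comments with no alphabetic characters become empty strings after
    # cleaning; if desired, they can be dropped before classification, e.g.:
    #   df = df[df['clean_comment'] != ""]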
st.markdown("### Running Emotion Analysis")
with st.spinner("Loading emotion analysis model..."):
# Initialize the Hugging Face emotion analysis pipeline
emotion_classifier = pipeline("text-classification",
model="j-hartmann/emotion-english-distilroberta-base",
return_all_scores=True)
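
    # Performance note (optional): wrapping the model load in a cached helper avoids
    # reloading it on every Streamlit rerun, e.g.:
    #
    #   @st.cache_resource
    #   def load_emotion_classifier():
    #       return pipeline("text-classification",
    #                       model="j-hartmann/emotion-english-distilroberta-base",
    #                       top_k=None)
    #
    # Batch calls (passing a list of texts to the pipeline) are also usually faster
    # than classifying comments one at a time with .apply().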

    # Analyze the emotion of each comment
    st.markdown("Analyzing emotions for each comment...")
    # To avoid long waits, analysis can be limited to the first N rows
    N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
    if N > 0:
        df_subset = df.head(N).copy()
    else:
        df_subset = df.copy()

    # Apply emotion analysis to each cleaned comment
    df_subset['emotion'] = df_subset['clean_comment'].apply(lambda x: analyze_emotion(x, emotion_classifier))
st.write("Sample emotion analysis results:")
st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())
# Aggregate emotion counts for visualization
emotion_counts = df_subset['emotion'].value_counts().reset_index()
emotion_counts.columns = ['emotion', 'count']
st.markdown("### Emotion Distribution")
st.bar_chart(emotion_counts.set_index('emotion'))
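
    # Since matplotlib is imported above, the same distribution can also be drawn as a
    # matplotlib figure (a minimal sketch; st.bar_chart already covers the basics).
    fig, ax = plt.subplots()
    ax.bar(emotion_counts['emotion'], emotion_counts['count'])
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Number of comments")
    st.pyplot(fig)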
st.markdown("### Detailed Results")
st.write(df_subset)