# Spaces: Sleeping
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from transformers import pipeline
# Download the NLTK stopword corpus only when it is not available locally.
# Streamlit re-executes this script on every widget interaction, and an
# unconditional nltk.download() call contacts the NLTK index each time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# --- Helper Functions ---
# Compiled once at import time: clean_text runs once per row of the CSV.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Normalize raw text: lowercase it, drop every character that is not an
    ASCII letter or whitespace, and collapse whitespace runs to single spaces.

    Args:
        text: The text to clean. ``None`` and float NaN (the usual
            missing-value markers) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, stripped of leading/trailing whitespace. May be
        empty.
    """
    # Missing values carry no text; `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_only = _NON_LETTER_RE.sub('', lowered)
    return _WHITESPACE_RE.sub(' ', letters_only).strip()
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text, stop_words=None):
    """
    Remove stopwords from whitespace-tokenized text.

    Args:
        text: The text to filter. It is split on whitespace; tokens are
            compared to the stopwords exactly as written (no case folding).
        stop_words: Optional container of words to drop (anything that
            supports ``in``). When omitted, NLTK's English stopword list is
            used; it is loaded on first use and cached, instead of being
            rebuilt for every call.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return " ".join(token for token in text.split() if token not in stop_words)
def analyze_emotion(text, classifier, max_chars=512):
    """
    Classify the emotion of a text and return the highest-scoring label.

    Args:
        text: The text to classify.
        classifier: A callable (e.g. a Hugging Face text-classification
            pipeline) that takes a string and returns scored labels as dicts
            with "label" and "score" keys.
        max_chars: Maximum number of characters passed to the classifier, to
            avoid overly long inputs. Defaults to 512; ``None`` disables
            truncation.

    Returns:
        The "label" value of the entry with the highest "score".
    """
    output = classifier(text[:max_chars])

    # The result shape depends on how the classifier was configured: a nested
    # list [[{...}, ...]], a flat list [{...}, ...], or a single dict.
    # Reduce all of them to one flat list of {"label", "score"} dicts.
    candidates = output
    if (
        isinstance(candidates, (list, tuple))
        and candidates
        and isinstance(candidates[0], (list, tuple))
    ):
        candidates = candidates[0]
    if isinstance(candidates, dict):
        candidates = [candidates]

    best = max(candidates, key=lambda item: item["score"])
    return best["label"]
# --- Streamlit App Interface ---
@st.cache_resource(show_spinner=False)
def load_emotion_classifier():
    """
    Build the Hugging Face emotion classification pipeline.

    Cached as a Streamlit resource so the model is loaded once per server
    process rather than on every script rerun (each widget interaction
    re-executes this file from the top).
    """
    return pipeline("text-classification",
                    model="j-hartmann/emotion-english-distilroberta-base",
                    return_all_scores=True)


st.title("NYT Comments Emotion Analysis App")
st.markdown(
    """
Upload your CSV file containing NYT comments.
If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
"""
)

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # Let user select a text column if 'comment_body' is not present
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for emotion analysis", options=df.columns.tolist())
    else:
        text_column = 'comment_body'

    st.markdown("### Preprocessing Comments")
    # Missing cells become empty strings. A bare astype(str) would turn NaN
    # into the literal word "nan", which would then be classified as if it
    # were a real comment.
    raw_comments = df[text_column]
    comments = raw_comments.astype(str).where(raw_comments.notna(), "")
    # Preprocess the selected column's text: clean and remove stopwords
    df['clean_comment'] = comments.apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())

    st.markdown("### Running Emotion Analysis")
    with st.spinner("Loading emotion analysis model..."):
        # Cached: only the first run actually loads the model.
        emotion_classifier = load_emotion_classifier()

    # Analyze emotion for each comment
    st.markdown("Analyzing emotions for each comment...")
    # To avoid long waiting times, you can limit analysis to first N rows
    N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
    # Copy in both branches: a column is added below, and assigning into the
    # result of head() directly triggers pandas' SettingWithCopyWarning.
    df_subset = df.head(N).copy() if N > 0 else df.copy()

    # Apply emotion analysis to each cleaned comment. Comments that are empty
    # after preprocessing have nothing to classify; they get no label and are
    # therefore left out of the distribution below.
    df_subset['emotion'] = df_subset['clean_comment'].apply(
        lambda comment: analyze_emotion(comment, emotion_classifier) if comment else None
    )
    st.write("Sample emotion analysis results:")
    st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())

    # Aggregate emotion counts for visualization
    emotion_counts = df_subset['emotion'].value_counts().reset_index()
    emotion_counts.columns = ['emotion', 'count']
    st.markdown("### Emotion Distribution")
    st.bar_chart(emotion_counts.set_index('emotion'))

    st.markdown("### Detailed Results")
    st.write(df_subset)