import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import pipeline
import matplotlib.pyplot as plt

# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')

# --- Helper Functions ---
def clean_text(text):
    """
    Clean the input text by converting it to lowercase and removing
    non-alphabetic characters and extra whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
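
# Illustrative example (hypothetical input):
#   clean_text("Great article, NYT!!")  ->  "great article nyt"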

def remove_stopwords(text):
    """
    Remove common English stopwords from the text.
    """
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)

def analyze_emotion(text, classifier):
    """
    Use the Hugging Face pipeline to analyze the emotion of the given text.
    Truncates the text to 512 characters if necessary and returns the label
    with the highest score.
    """
    # Truncate the text to avoid overly long inputs
    truncated = text[:512]
    result = classifier(truncated)[0]
    # result is a list of dictionaries with keys "label" and "score"
    best = max(result, key=lambda x: x["score"])
    return best["label"]

# --- Streamlit App Interface ---
st.title("NYT Comments Emotion Analysis App")
st.markdown(
    """
    Upload your CSV file containing NYT comments.
    If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
    """
)

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # Let the user select a text column if 'comment_body' is not present
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for emotion analysis", options=df.columns.tolist())
    else:
        text_column = 'comment_body'

    st.markdown("### Preprocessing Comments")
    # Preprocess the selected column's text: clean and remove stopwords
    df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
    st.write("Sample processed comments:")
    st.write(df[[text_column, 'clean_comment']].head())

    st.markdown("### Running Emotion Analysis")
    with st.spinner("Loading emotion analysis model..."):
        # Initialize the Hugging Face emotion analysis pipeline
        emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            return_all_scores=True,
        )

    # Analyze emotion for each comment
    st.markdown("Analyzing emotions for each comment...")
    # To avoid long waiting times, you can limit the analysis to the first N rows
    N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
    if N > 0:
        # .copy() avoids pandas SettingWithCopyWarning when the 'emotion' column is added below
        df_subset = df.head(N).copy()
    else:
        df_subset = df.copy()

    # Apply emotion analysis to each cleaned comment
    df_subset['emotion'] = df_subset['clean_comment'].apply(lambda x: analyze_emotion(x, emotion_classifier))
    st.write("Sample emotion analysis results:")
    st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())

    # Aggregate emotion counts for visualization
    emotion_counts = df_subset['emotion'].value_counts().reset_index()
    emotion_counts.columns = ['emotion', 'count']
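    # Illustrative shape of `emotion_counts` after aggregation (counts are
    # hypothetical; the label set is whatever the model actually predicts):
    #      emotion  count
    #   0      joy    412
    #   1  neutral    303
    #   2    anger     97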
| st.markdown("### Emotion Distribution") | |
| st.bar_chart(emotion_counts.set_index('emotion')) | |
| st.markdown("### Detailed Results") | |
| st.write(df_subset) | |
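
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Note: the first run downloads the model weights from the Hugging Face Hub,
# which can take a while depending on the connection.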