# TopicAnalysis / app.py
import streamlit as st
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import pipeline
import matplotlib.pyplot as plt
# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')


# --- Helper Functions ---
def clean_text(text):
    """
    Clean the input text by lowercasing it and removing non-alphabetic
    characters and extra whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
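
# Illustrative usage (hypothetical input, not taken from any real dataset):
#   clean_text("Great article!! 100% agree...")  ->  "great article agree"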


def remove_stopwords(text):
    """
    Remove common English stopwords from the text.
    """
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


def analyze_emotion(text, classifier):
    """
    Use the Hugging Face pipeline to analyze the emotion of the given text.
    Truncates the input to its first 512 characters to keep it short
    (a rough character-level cap, not an exact token limit).
    Returns the label with the highest score.
    """
    # Truncate the text to avoid overly long inputs
    truncated = text[:512]
    result = classifier(truncated)[0]
    # result is a list of dictionaries with keys "label" and "score"
    best = max(result, key=lambda x: x["score"])
    return best["label"]
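
# With all scores returned, a single call produces output shaped roughly like
# (labels and scores below are illustrative, not actual model output):
#   classifier("some text")  ->  [[{'label': 'joy', 'score': 0.91},
#                                  {'label': 'anger', 'score': 0.02}, ...]]
# analyze_emotion() indexes into the outer list and keeps the top-scoring label.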


# --- Streamlit App Interface ---
st.title("NYT Comments Emotion Analysis App")
st.markdown(
    """
    Upload your CSV file containing NYT comments.
    If your CSV doesn’t have a column named **comment_body**, you can select which column to use.
    """
)

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Load the CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Columns in CSV:", df.columns.tolist())

    # Let the user select a text column if 'comment_body' is not present
    if 'comment_body' not in df.columns:
        st.warning("Column 'comment_body' not found in CSV.")
        text_column = st.selectbox("Select the column to use for emotion analysis",
                                   options=df.columns.tolist())
    else:
        text_column = 'comment_body'
st.markdown("### Preprocessing Comments")
# Preprocess the selected column's text: clean and remove stopwords
df['clean_comment'] = df[text_column].astype(str).apply(clean_text).apply(remove_stopwords)
st.write("Sample processed comments:")
st.write(df[[text_column, 'clean_comment']].head())
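
    # Note: comments with no alphabetic characters become empty strings after
    # cleaning; if desired, they can be dropped before classification, e.g.:
    #   df = df[df['clean_comment'] != ""]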
st.markdown("### Running Emotion Analysis")
with st.spinner("Loading emotion analysis model..."):
# Initialize the Hugging Face emotion analysis pipeline
emotion_classifier = pipeline("text-classification",
model="j-hartmann/emotion-english-distilroberta-base",
return_all_scores=True)
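
    # Performance note (optional): wrapping the model load in a cached helper avoids
    # reloading it on every Streamlit rerun, e.g.:
    #
    #   @st.cache_resource
    #   def load_emotion_classifier():
    #       return pipeline("text-classification",
    #                       model="j-hartmann/emotion-english-distilroberta-base",
    #                       top_k=None)
    #
    # Batch calls (passing a list of texts to the pipeline) are also usually faster
    # than classifying comments one at a time with .apply().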

    # Analyze the emotion of each comment
    st.markdown("Analyzing emotions for each comment...")
    # To avoid long waits, analysis can be limited to the first N rows
    N = st.number_input("Number of comments to analyze (set 0 for all)", min_value=0, value=0)
    if N > 0:
        df_subset = df.head(N).copy()
    else:
        df_subset = df.copy()

    # Apply emotion analysis to each cleaned comment
    df_subset['emotion'] = df_subset['clean_comment'].apply(lambda x: analyze_emotion(x, emotion_classifier))
st.write("Sample emotion analysis results:")
st.write(df_subset[[text_column, 'clean_comment', 'emotion']].head())
# Aggregate emotion counts for visualization
emotion_counts = df_subset['emotion'].value_counts().reset_index()
emotion_counts.columns = ['emotion', 'count']
st.markdown("### Emotion Distribution")
st.bar_chart(emotion_counts.set_index('emotion'))
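
    # Since matplotlib is imported above, the same distribution can also be drawn as a
    # matplotlib figure (a minimal sketch; st.bar_chart already covers the basics).
    fig, ax = plt.subplots()
    ax.bar(emotion_counts['emotion'], emotion_counts['count'])
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Number of comments")
    st.pyplot(fig)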
st.markdown("### Detailed Results")
st.write(df_subset)