Kurkur99's picture
Update eda.py
298c4f8
raw
history blame contribute delete
No virus
1.89 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
def label_sentiment(rating):
    """Map a star rating to a coarse sentiment label.

    Ratings 1 and 2 give 'negative', 3 gives 'neutral', 4 and 5 give
    'positive'; every other value gives 'unknown'.
    """
    # Ordered (ratings, label) table; membership uses equality, so 4.0
    # matches 4 just like a direct comparison would.
    buckets = (
        ((1, 2), 'negative'),
        ((3,), 'neutral'),
        ((4, 5), 'positive'),
    )
    for ratings, label in buckets:
        if rating in ratings:
            return label
    return 'unknown'
def process_review(review):
    """Normalize review text for word-cloud generation.

    The text is lowercased and every character that is not a lowercase
    a-z letter or whitespace is removed (digits, punctuation, emoji, ...).

    Args:
        review: The raw review. Missing values (``None`` or a float NaN)
            yield an empty string instead of raising; any other non-string
            value is converted with ``str`` before cleaning.

    Returns:
        The cleaned, lowercase text.
    """
    if not isinstance(review, str):
        # NaN is the only float that is not equal to itself.
        if review is None or (isinstance(review, float) and review != review):
            return ''
        review = str(review)
    review = review.lower()
    review = re.sub(r'[^a-z\s]', '', review)  # Remove non-alphabetical characters
    return review
def display_eda(data):
    """Render exploratory plots for a review dataset in the Streamlit app.

    Shows a bar chart of sentiment counts, then one word cloud per sentiment
    built from the ``review_description`` column.

    Args:
        data: DataFrame with a ``sentiment`` column, or a ``rating`` column
            from which ``sentiment`` is derived via ``label_sentiment``.
            When derived, the new column is added to ``data`` in place.

    Returns:
        None. Problems with the input (missing columns, no rows, no usable
        words) are reported through ``st.error`` / ``st.info`` rather than
        raised.
    """
    # Derive the 'sentiment' column from 'rating' if it doesn't exist
    if 'sentiment' not in data.columns:
        if 'rating' not in data.columns:
            st.error("The dataset does not contain a 'rating' or 'sentiment' column. Please check the data source.")
            return
        data['sentiment'] = data['rating'].apply(label_sentiment)

    # Distribution of sentiments
    st.subheader("Distribution of Sentiments")
    sentiment_counts = data['sentiment'].value_counts()
    if sentiment_counts.empty:
        # Nothing to plot (no rows, or every sentiment is missing).
        st.info("No sentiment data available to display.")
        return
    fig, ax = plt.subplots()
    sentiment_counts.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of Sentiments')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    st.pyplot(fig)
    # Close the figure once rendered so figures do not pile up across reruns.
    plt.close(fig)

    # Word cloud for each sentiment
    st.subheader("Word Clouds for Sentiments")
    if 'review_description' not in data.columns:
        st.error("The dataset does not contain a 'review_description' column, so word clouds cannot be generated.")
        return

    # dropna(): a missing sentiment never equals itself, so it would select
    # an empty subset.
    for sentiment in data['sentiment'].dropna().unique():
        st.write(f"Word Cloud for {sentiment}")
        reviews = data.loc[data['sentiment'] == sentiment, 'review_description'].dropna()
        # str(): guard against non-text cells (e.g. purely numeric reviews).
        text = " ".join(process_review(str(review)) for review in reviews)
        try:
            wordcloud = WordCloud(max_words=100, background_color="white").generate(text)
        except ValueError:
            # WordCloud raises ValueError when no words are left to draw.
            st.info(f"No words available to build a word cloud for {sentiment}.")
            continue
        # Draw on explicit axes instead of relying on pyplot's global state.
        fig, ax = plt.subplots()
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        st.pyplot(fig)
        plt.close(fig)