|
import os |
|
import re |
|
import time |
|
import streamlit as st |
|
import matplotlib.pyplot as plt |
|
import pandas as pd |
|
import numpy as np |
|
import altair as alt |
|
import plotly.express as px |
|
|
|
from pysentimiento import create_analyzer |
|
from utils import load_data_pickle |
|
|
|
# Let the page content use the wide layout instead of the default centered one.
st.set_page_config(layout="wide")
|
|
|
def clean_text(text):
    """Normalise a raw review string.

    The steps run in this order: lower-case the text, remove numeric dates
    such as ``12/05/2021``, remove every character that is not a word
    character, whitespace or one of ``. ' , : /``, shrink ``"ggg"`` to
    ``"g"``, and finally replace double spaces with a single space.

    Args:
        text: The review text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    pattern_punct = r"[^\w\s.',:/]"
    pattern_date = r'\b\d{1,2}/\d{1,2}/\d{2,4}\b'

    text = text.lower()
    text = re.sub(pattern_date, '', text)
    text = re.sub(pattern_punct, '', text)
    text = text.replace("ggg", "g")
    # Dropping a date or a symbol that sat between two spaces leaves a double
    # space behind; replacing a single space with itself would change nothing.
    text = text.replace("  ", " ")
    return text
|
|
|
# The analyzer is a model object, so it is cached as a shared resource:
# st.cache_data is meant for serialisable data and would pickle the return
# value and hand back a fresh copy on every call.
@st.cache_resource(ttl=3600, show_spinner=False)
def load_sa_model():
    """Create the English sentiment analyzer and cache it for one hour.

    Returns:
        The analyzer returned by ``create_analyzer(task="sentiment", lang="en")``.
    """
    return create_analyzer(task="sentiment", lang="en")
|
|
|
|
|
|
|
|
|
# ---- Page header: short introduction to sentiment analysis ----
st.markdown("# Sentiment Analysis")

st.markdown("### What is Sentiment Analysis ?")

st.info("""
Sentiment analysis is a **Natural Language Processing** (NLP) task that involves determining the sentiment or emotion expressed in a piece of text.
It has a wide range of use cases across various industries, as it helps organizations gain insights into the opinions, emotions, and attitudes expressed in text data.""")

st.markdown("Here is an example of Sentiment analysis used to analyze **Customer Satisfaction** for perfumes.")

# Centre the illustration by rendering it in the middle of three columns.
_, col, _ = st.columns([0.1, 0.8, 0.1])
with col:
    st.image("images/sentiment_analysis.png")

# Empty markdown element used as a spacer.
st.markdown(" ")

st.markdown("""
Common applications of Natural Language Processing include:
- **Customer Feedback and Reviews** π―: Assessing reviews on products or services to understand customer satisfaction and identify areas for improvement.
- **Market Research** π: Analyzing survey responses or online forums to gauge public opinion on products, services, or emerging trends.
- **Financial Market Analysis** π: Monitoring financial news, reports, and social media to gauge investor sentiment and predict market trends.
- **Government and Public Policy** π£: Analyzing public opinion on government policies, initiatives, and political decisions to gauge public sentiment and inform decision-making.
""")

st.divider()
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Use case presentation: customer reviews ----
st.markdown("### Customer Reviews π")

use_case_summary = (
    "In this use case, **sentiment analysis** is used to predict the **polarity** (negative, neutral, positive) of customer reviews.\n"
    "You can try the application by using the provided starbucks customer reviews, or by writing your own."
)
st.info(use_case_summary)
st.markdown(" ")

# Only the middle one of the three columns is used, which centres the picture.
col = st.columns([0.25, 0.5, 0.25])[1]
col.image("images/reviews.jpg")

st.markdown(" ")
|
|
|
|
|
|
|
# ---- Raw reviews shown in the "Starbucks reviews" tab ----
path_sa = "data/sa_data"

# A fresh 0..n-1 index is set here; the results lookup further down selects
# rows by position using this index.
reviews_df = load_data_pickle(path_sa, "reviews_raw.pkl").reset_index(drop=True)
reviews_df = reviews_df.assign(
    Date=reviews_df["Date"].dt.date,      # keep the calendar date only
    Year=reviews_df["Year"].astype(int),  # integer years for the year filter
)

# ---- Two ways of trying the model: provided reviews or free text ----
st.markdown("#### Predict polarity π€")
tab1_, tab2_ = st.tabs(["Starbucks reviews", "Write a review"])
|
|
|
# ---- Tab 1: browse/filter the provided reviews and display stored predictions ----
with tab1_:
    st.markdown(" ")

    col1, col2 = st.columns([0.2, 0.8], gap="medium")

    # Left column: choose at most one filter, then trigger the run.
    with col1:
        st.markdown(
            "<b>Filter reviews: </b> <br>\n"
            "You can filter the dataset by Date, State or Rating",
            unsafe_allow_html=True,
        )

        # The label is hidden visually but kept non-empty: Streamlit discourages
        # empty widget labels for accessibility reasons.
        select_image_box = st.radio(
            "Filter reviews",
            ["Filter by Date (Year)", "Filter by State", "Filter by Rating", "No filters"],
            index=3,
            label_visibility="collapsed",
        )

        # The radio returns exactly one option; "No filters" leaves reviews_df as is.
        if select_image_box == "Filter by Date (Year)":
            year_options = reviews_df["Year"].unique()
            selected_date = st.multiselect("Date (Year)", year_options, default=year_options[0])
            reviews_df = reviews_df.loc[reviews_df["Year"].isin(selected_date)]
        elif select_image_box == "Filter by State":
            state_options = reviews_df["State"].unique()
            selected_state = st.multiselect("State", state_options, default=state_options[0])
            reviews_df = reviews_df.loc[reviews_df["State"].isin(selected_state)]
        elif select_image_box == "Filter by Rating":
            rating_options = sorted(reviews_df["Rating"].dropna().unique())
            selected_rating = st.multiselect("Rating", rating_options, default=rating_options[0])
            reviews_df = reviews_df.loc[reviews_df["Rating"].isin(selected_rating)]

        run_model1 = st.button("**Run the model**", type="primary", key="tab1")
        st.info("The model has already been trained in this use case.")

    # Right column: the (possibly filtered) reviews table.
    with col2:
        st.markdown(
            "<b>View the reviews:</b> <br>\n"
            "The dataset contains the location (State), date, rating, text and images (if provided) for each review.",
            unsafe_allow_html=True,
        )

        st.data_editor(
            reviews_df.drop(columns=["Year"]),
            column_config={
                "Image 1": st.column_config.ImageColumn("Image 1"),
                "Image 2": st.column_config.ImageColumn("Image 2"),
            },
            hide_index=True,
        )

    if run_model1:
        with st.spinner('Wait for it...'):
            # Stored predictions are read from a pickle; no model is run here.
            df_results = load_data_pickle(path_sa, "reviews_results.pkl")
            df_results.reset_index(drop=True, inplace=True)

            # reviews_df keeps its original 0..n-1 labels after filtering, so they
            # double as row positions into the results table.
            index_row = np.array(reviews_df.index)
            df_results = df_results.iloc[index_row].reset_index(drop=True)
            # Assign by position: df_results now has a fresh 0..k-1 index while
            # reviews_df still carries the original labels, so a label-aligned
            # assignment would attach the wrong (or missing) review texts.
            df_results["Review"] = reviews_df["Review"].to_numpy()
            st.markdown(" ")

        st.markdown("#### See the results βοΈ")
        tab1, tab2, tab3 = st.tabs(["All results", "Results per state", "Results per year"])

        with tab1:
            df_results_tab1 = df_results[["ID", "Review", "Rating", "Negative", "Neutral", "Positive", "Result"]]

            # Share of reviews whose predicted result is "Negative". The lookup
            # falls back to 0 when no review is negative or the selection is empty.
            result_counts = df_results_tab1["Result"].value_counts()
            total_results = result_counts.sum()
            if total_results:
                perct_negative = round(100 * result_counts.get("Negative", 0) / total_results, 2)
            else:
                perct_negative = 0.0

            if perct_negative > 50:
                st.error(f"**Negative reviews alert** β οΈ: The proportion of negative reviews is {perct_negative}% !")

            # The three score columns share the same 0-100 progress-bar settings.
            score_columns = {
                name: st.column_config.ProgressColumn(
                    label,
                    help=f"{name} score of the review",
                    format="%d%%",
                    min_value=0,
                    max_value=100,
                )
                for name, label in (
                    ("Negative", "Negative π"),
                    ("Neutral", "Neutral β"),
                    ("Positive", "Positive π"),
                )
            }
            st.data_editor(df_results_tab1, column_config=score_columns, hide_index=True)

        with tab2:
            # Mean score per state, reshaped to one row per (State, Sentiment).
            avg_state = df_results[["State", "Negative", "Neutral", "Positive"]].groupby(["State"]).mean().round()
            avg_state = avg_state.reset_index().melt(id_vars="State", var_name="Sentiment", value_name="Score (%)")

            chart_state = alt.Chart(avg_state, title="Review polarity per state").mark_bar().encode(
                x=alt.X('Sentiment', axis=alt.Axis(title=None, labels=False, ticks=False)),
                y=alt.Y('Score (%)', axis=alt.Axis(grid=False)),
                color='Sentiment',
                column=alt.Column('State', header=alt.Header(title=None, labelOrient='bottom'))
            ).configure_view(
                stroke='transparent'
            ).interactive()

            st.markdown(" ")
            st.altair_chart(chart_state)

        with tab3:
            # Mean score per year, reshaped to one row per (Year, Sentiment).
            avg_year = df_results[["Year", "Negative", "Neutral", "Positive"]]
            avg_year = avg_year.groupby(["Year"]).mean().round()
            avg_year = avg_year.reset_index().melt(id_vars="Year", var_name="Sentiment", value_name="Score (%)")

            chart_year = alt.Chart(avg_year, title="Evolution of review polarity").mark_area(opacity=0.5).encode(
                x='Year',
                y='Score (%)',
                color='Sentiment',
            ).interactive()

            st.markdown(" ")
            st.altair_chart(chart_year, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Tab 2: score a review typed by the user ----
with tab2_:
    st.markdown("**Write your own review**")

    # Pre-filled example shown in the text area.
    default_review = (
        "I recently visited a local Starbucks, and unfortunately, my experience was far from satisfactory. "
        "From the moment I stepped in, the atmosphere felt chaotic and disorganized. "
        "The staff appeared overwhelmed, leading to a significant delay in receiving my order. "
        "The quality of my drink further added to my disappointment. "
        "The coffee tasted burnt, as if it had been sitting on the burner for far too long."
    )
    txt_review = st.text_area("Write your review", default_review, label_visibility="collapsed")

    run_model2 = st.button("**Run the model**", type="primary", key="tab2")

    if run_model2:
        with st.spinner('Wait for it...'):
            sentiment_analyzer = load_sa_model()
            prediction = sentiment_analyzer.predict(txt_review)
            probas = prediction.probas

            # One row per polarity, in the order Positive, Neutral, Negative.
            polarity_keys = ("POS", "NEU", "NEG")
            df_review_user = pd.DataFrame({
                "Polarity": ["Positive", "Neutral", "Negative"],
                "Score": [probas[key] for key in polarity_keys],
            })

            # Whole percentages, truncated (not rounded) with int().
            pos_pct, neu_pct, neg_pct = (int(probas[key] * 100) for key in polarity_keys)

            st.markdown(" ")
            st.info(
                f"Your review was **{pos_pct}%** positive, **{neu_pct}%** neutral\n"
                f"and **{neg_pct}%** negative."
            )

            fig = px.bar(
                df_review_user,
                x='Score',
                y='Polarity',
                color="Polarity",
                title='Sentiment analysis results',
                orientation="h",
            )
            st.plotly_chart(fig, use_container_width=True)
|
|
|
|