import random
import warnings
import pandas as pd
from PIL import Image
import streamlit as st
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from utils.utils import get_top_ngram
NLTK_DATA = os.getenv("NLTK_PATH")
if not NLTK_DATA:
NLTK_DATA = "/home/user/code/nltk_data"
nltk.data.path.append(NLTK_DATA)
warnings.filterwarnings("ignore")
try:
    nltk.data.find("corpora/stopwords.zip")
except LookupError:
    nltk.download("stopwords", download_dir=NLTK_DATA)
try:
nltk.data.find("tokenizers/punkt.zip")
except LookupError:
nltk.download("punkt", download_dir=NLTK_DATA)
@st.cache_data
def load_data(df):
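    """Parse the `createdAt` timestamps and add a `date` column (YYYY-MM-DD) used for grouping by day."""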
# df = pd.read_csv(file_path, dtype={'text': 'string', 'sentiment_label': 'category'})
df["createdAt"] = pd.to_datetime(df["createdAt"])
df["date"] = df["createdAt"].dt.strftime("%Y-%m-%d")
return df
@st.cache_data
def process_texts(texts):
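    """Tokenize each text, lowercase it, drop Spanish and custom stopwords, and re-join the remaining tokens."""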
    # Domain-specific Spanish stopwords (filler verbs, slang) added on top of NLTK's list.
    custom_stopwords = {
        'ser', 'haber', 'hacer', 'tener', 'poder', 'ir', 'q', 'si', 'solo', 'saber', 'decir',
        'dar', 'querer', 'ver', 'así', 'sos', 'maje', 'dejar', 'op', 'vos',
        'cada', 'mismo', 'usted', 'mas', 'pues', 'andar', 'ahora', 'claro', 'nunca', 'quedar', 'pasar',
        'venir', 'poner', 'dio', 'señora', 'señor', 'ahí', 'asi', 'vez', 'jajaja',
    }
stop_words = set(stopwords.words("spanish"))
stop_words.update(custom_stopwords)
tokenized_texts = texts.apply(word_tokenize)
tokenized_texts = tokenized_texts.apply(
lambda x: [word.lower() for word in x if word.lower() not in stop_words]
)
texts_cleaned = tokenized_texts.apply(lambda x: " ".join(x))
return texts_cleaned
def custom_color_func(word, font_size, position, orientation, font_path, random_state):
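    """Return a random color from the app's palette; passed to WordCloud as its color_func."""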
color_palette = ["#ff2b2b", "#83c9ff", "#0068c9"]
return random.choice(color_palette)
def display_word_cloud(dataframe):
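    """Build a word cloud from the `text` column and embed it as an image in a Plotly figure."""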
all_text = " ".join(dataframe["text"])
wordcloud = WordCloud(
background_color="#fff", colormap="autumn", color_func=custom_color_func
).generate(all_text)
wordcloud_image = wordcloud.to_array()
fig = go.Figure()
fig.add_layout_image(
dict(
source=Image.fromarray(wordcloud_image),
x=0,
y=1,
sizex=1,
sizey=1.3,
opacity=1,
)
)
fig.update_layout(
autosize=False,
height=170,
width=500,
margin=dict(l=0, r=0, t=0, b=0),
xaxis=dict(visible=False),
yaxis=dict(visible=False),
)
return fig
def most_common_trigrams(df, pdf=False):
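    """Plot the most frequent bigrams (n=2, despite the function name) per sentiment class as three horizontal bar subplots."""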
stop_words = set(stopwords.words("spanish")) # noqa: F841
colors = ["#ff2b2b", "#83c9ff", "#0068c9"]
fig = make_subplots(rows=1, cols=3)
sentiment_list = ["positive", "neutral", "negative"]
sentiment_list2 = ["POS", "NEU", "NEG"]
for i in range(3):
texts = df[df["sentiment_label"] == sentiment_list2[i]]["text"]
texts_cleaned = process_texts(texts)
top_n_bigrams = get_top_ngram(texts_cleaned, 2)[:15]
x, y = map(list, zip(*top_n_bigrams))
        fig.add_trace(
            go.Bar(
                x=y,
                orientation="h",
                name=sentiment_list[i].title(),
                marker=dict(color=colors[i]),
                text=x,
                textposition="inside",
                # For horizontal bars the count is on the x axis; the n-gram itself is the bar text.
                hovertemplate="%{text}: %{x}",
            ),
            1,
            i + 1,
        )
fig.update_layout(
autosize=False,
margin=dict(t=0, b=0, l=0, r=0),
height=250,
)
return fig
def display_target_count(df):
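    """Show the distribution of sentiment labels as a pie chart and a bar chart side by side."""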
    counts = df.sentiment_label.value_counts()
    colors = ["#83c9ff", "#ff2b2b", "#0068c9"]
    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]])
    fig.add_trace(
        go.Pie(
            labels=counts.index,
            values=counts.values,
        ),
        1,
        1,
    )
    # Only the pie trace exists at this point, so the update applies to it alone.
    fig.update_traces(
        hoverinfo="label+percent",
        textfont_size=18,
        marker=dict(colors=colors, line=dict(color="#fff", width=1)),
    )
    fig.add_trace(
        go.Bar(
            x=counts.index,
            y=counts.values,
            marker_color=colors,
        ),
        1,
        2,
    )
fig.update_layout(
title_text="Análisis de Sentimientos",
title_y=1,
title_font=dict(color="#808495", size=15),
autosize=True,
height=250,
margin=dict(l=0, r=0, t=25, b=10),
xaxis=dict(visible=False),
yaxis=dict(visible=False),
)
return fig
def sentiment_over_date(df):
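    """Plot a stacked area chart of sentiment counts per day."""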
df = load_data(df)
grouped = df.groupby(["date", "sentiment_label"]).size().unstack(fill_value=0)
fig = go.Figure()
colors = ["#ff2b2b", "#83c9ff", "#0068c9"][::-1]
for idx, sentiment_label in enumerate(grouped.columns):
fig.add_trace(
go.Scatter(
x=grouped.index,
y=grouped[sentiment_label],
mode="lines",
name=sentiment_label.capitalize(),
stackgroup="one",
line=dict(width=2, color=colors[idx]),
fillcolor=colors[idx],
hoverinfo="y+name",
)
)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.update_layout(
title={
"text": "Sentimiento a través del tiempo",
"x": 0.2,
"y": 1,
"xanchor": "center",
"yanchor": "top",
"font": {"size": 15, "color": "#808495", "family": "Arial"},
},
xaxis_title="Fecha",
yaxis_title="Conteo",
hovermode="x",
showlegend=True,
autosize=False,
height=250,
width=500,
margin=dict(l=0, r=0, t=40, b=0),
plot_bgcolor="white",
paper_bgcolor="white",
)
return fig
##############################################################################################################################
def crear_grafico_dispersion(df):
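    """Scatter plot relating each comment's like count to its sentiment label."""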
fig = px.scatter(
df,
x="likeCount",
y="sentiment_label",
color="sentiment_label",
labels={
"likeCount": "Número de Likes",
"sentiment_label": "Etiqueta de Sentimiento",
},
title="Relación entre Número de Likes y Etiquetas de Sentimiento",
)
fig.update_layout(
title_y=1,
title_font=dict(color="#808495", size=15),
autosize=True,
height=250,
margin=dict(l=0, r=0, t=20, b=0),
# xaxis=dict(visible=False),
# yaxis=dict(visible=False)
)
return fig
def bubble_fig(df):
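    """Bubble chart of how many users fall into each account-creation-time bucket (bubble size = user count)."""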
bubble_chart_data = (
df.groupby("account_creation_time").size().reset_index(name="user_count")
)
bubble_fig = px.scatter(
bubble_chart_data,
x="account_creation_time",
y="user_count",
size="user_count",
title="Tiempo de Creación de Cuenta
vs. Número de Usuarios",
labels={
"account_creation_time": "Tiempo de Creación de Cuenta (meses)",
"user_count": "Número de Usuarios",
},
)
return bubble_fig
def hist_fig(df):
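    """Histogram of account creation times (in months)."""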
hist_fig = px.histogram(
df,
x="account_creation_time",
title="Distribución del Tiempo de Creación de Cuenta",
labels={
"account_creation_time": "Tiempo de Creación de Cuenta (meses)",
"user_count": "Número de Usuarios",
},
nbins=25,
)
return hist_fig
def stacked_bar_fig(df):
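    """Stacked histogram of account creation times, colored by comment sentiment."""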
stacked_bar_fig = px.histogram(
df,
x="account_creation_time",
color="sentiment_label",
title="Distribución del Tiempo de
Creación de Cuenta por Sentimiento de Comentario",
labels={
"account_creation_time": "Tiempo de Creación de Cuenta (meses)",
"count": "Número de Usuarios",
"sentiment_beto": "Sentimiento",
},
barmode="stack",
nbins=25,
)
return stacked_bar_fig
def metrics_bar(tweet_data, df):
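    """Render account-age metrics (average, min, max) and the sentiment breakdown as Streamlit metric widgets."""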
avg_time = df["account_creation_time"].mean()
min_time = df["account_creation_time"].min()
max_time = df["account_creation_time"].max()
left, right = st.columns([2, 1])
with left:
with st.container(border=True):
# st.write("###### Analysis of Time Metrics")
col1, col2, col3 = st.columns(3)
col1.metric("Tiempo Promedio", f"{round(avg_time/12)} años")
col2.metric("### Tiempo Mínimo", f"{min_time} meses")
col3.metric("Tiempo Máximo", f"{round(max_time/12)} años")
with right:
with st.container(border=True):
# st.write("###### Sentiment Breakdown")
pos, neu, neg = st.columns(3)
# st.info(f"##### **Overall Sentiment**: :{TEXT_COLOR[tweet_data['overall_sentiment'].lower()]}[**{tweet_data['overall_sentiment']}**]")
pos.metric(label=":green[Positive]", value=tweet_data["positive"])
neu.metric(label=":gray[Neutral]", value=tweet_data["neutral"])
neg.metric(label=":red[Negative]", value=tweet_data["negative"])