import random import warnings import pandas as pd from PIL import Image import streamlit as st from wordcloud import WordCloud import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import os import nltk from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from utils.utils import get_top_ngram NLTK_DATA = os.getenv("NLTK_PATH") if not NLTK_DATA: NLTK_DATA = "/home/user/code/nltk_data" nltk.data.path.append(NLTK_DATA) warnings.filterwarnings("ignore") nltk.data.path.append(NLTK_DATA) try: nltk.data.find("corpora/stopwords.zip") except LookupError: nltk.download("stopwords") nltk.download("stopwords", download_dir=NLTK_DATA) try: nltk.data.find("tokenizers/punkt.zip") except LookupError: nltk.download("punkt", download_dir=NLTK_DATA) @st.cache_data def load_data(df): # df = pd.read_csv(file_path, dtype={'text': 'string', 'sentiment_label': 'category'}) df["createdAt"] = pd.to_datetime(df["createdAt"]) df["date"] = df["createdAt"].dt.strftime("%Y-%m-%d") return df @st.cache_data def process_texts(texts): custom_stopwords = set([ 'ser', 'haber', 'hacer', 'tener', 'poder', 'ir', 'q', 'si', 'solo', 'saber', 'decir', 'dar', 'querer', 'ver', 'así', 'sos', 'maje', 'dejar', 'si', 'solo', 'si', 'op', 'vos', 'cada', 'mismo', 'usted', 'mas', 'pues', 'andar', 'ahora', 'claro', 'nunca', 'quedar', 'pasar', 'venir', 'poner', 'dio', 'señora', 'señor', 'ahí', 'asi', 'vez', 'jajaja' ]) stop_words = set(stopwords.words("spanish")) stop_words.update(custom_stopwords) tokenized_texts = texts.apply(word_tokenize) tokenized_texts = tokenized_texts.apply( lambda x: [word.lower() for word in x if word.lower() not in stop_words] ) texts_cleaned = tokenized_texts.apply(lambda x: " ".join(x)) return texts_cleaned def custom_color_func(word, font_size, position, orientation, font_path, random_state): color_palette = ["#ff2b2b", "#83c9ff", "#0068c9"] return random.choice(color_palette) def display_word_cloud(dataframe): all_text = " ".join(dataframe["text"]) wordcloud = WordCloud( background_color="#fff", colormap="autumn", color_func=custom_color_func ).generate(all_text) wordcloud_image = wordcloud.to_array() fig = go.Figure() fig.add_layout_image( dict( source=Image.fromarray(wordcloud_image), x=0, y=1, sizex=1, sizey=1.3, opacity=1, ) ) fig.update_layout( autosize=False, height=170, width=500, margin=dict(l=0, r=0, t=0, b=0), xaxis=dict(visible=False), yaxis=dict(visible=False), ) return fig def most_common_trigrams(df, pdf=False): stop_words = set(stopwords.words("spanish")) # noqa: F841 colors = ["#ff2b2b", "#83c9ff", "#0068c9"] fig = make_subplots(rows=1, cols=3) sentiment_list = ["positive", "neutral", "negative"] sentiment_list2 = ["POS", "NEU", "NEG"] for i in range(3): texts = df[df["sentiment_label"] == sentiment_list2[i]]["text"] texts_cleaned = process_texts(texts) top_n_bigrams = get_top_ngram(texts_cleaned, 2)[:15] x, y = map(list, zip(*top_n_bigrams)) fig.add_trace( go.Bar( x=y, orientation="h", type="bar", name=sentiment_list[i].title(), marker=dict(color=colors[i]), text=x, textposition="inside", hovertemplate="%{text}: %{y}", ), 1, i + 1, ) fig.update_layout( autosize=False, margin=dict(t=0, b=0, l=0, r=0), height=250, ) return fig def display_target_count(df): colors = ["#83c9ff", "#ff2b2b", "#0068c9"] fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]]) fig.add_trace( go.Pie( labels=df.sentiment_label.value_counts().index, values=df.sentiment_label.value_counts().values, ), 1, 1, ) fig.update_traces( hoverinfo="label+percent", textfont_size=18, marker=dict(colors=colors, line=dict(color="#fff", width=1)), ) fig.add_trace( go.Bar( x=df.sentiment_label.value_counts().index, y=df.sentiment_label.value_counts().values, marker_color=colors, ), 1, 2, ) fig.update_layout( title_text="Análisis de Sentimientos", title_y=1, title_font=dict(color="#808495", size=15), autosize=True, height=250, margin=dict(l=0, r=0, t=25, b=10), xaxis=dict(visible=False), yaxis=dict(visible=False), ) return fig def sentiment_over_date(df): df = load_data(df) grouped = df.groupby(["date", "sentiment_label"]).size().unstack(fill_value=0) fig = go.Figure() colors = ["#ff2b2b", "#83c9ff", "#0068c9"][::-1] for idx, sentiment_label in enumerate(grouped.columns): fig.add_trace( go.Scatter( x=grouped.index, y=grouped[sentiment_label], mode="lines", name=sentiment_label.capitalize(), stackgroup="one", line=dict(width=2, color=colors[idx]), fillcolor=colors[idx], hoverinfo="y+name", ) ) fig.update_xaxes(showgrid=False) fig.update_yaxes(showgrid=False) fig.update_layout( title={ "text": "Sentimiento a través del tiempo", "x": 0.2, "y": 1, "xanchor": "center", "yanchor": "top", "font": {"size": 15, "color": "#808495", "family": "Arial"}, }, xaxis_title="Fecha", yaxis_title="Conteo", hovermode="x", showlegend=True, autosize=False, height=250, width=500, margin=dict(l=0, r=0, t=40, b=0), plot_bgcolor="white", paper_bgcolor="white", ) return fig ############################################################################################################################## def crear_grafico_dispersion(df): fig = px.scatter( df, x="likeCount", y="sentiment_label", color="sentiment_label", labels={ "likeCount": "Número de Likes", "sentiment_label": "Etiqueta de Sentimiento", }, title="Relación entre Número de Likes y Etiquetas de Sentimiento", ) fig.update_layout( title_y=1, title_font=dict(color="#808495", size=15), autosize=True, height=250, margin=dict(l=0, r=0, t=20, b=0), # xaxis=dict(visible=False), # yaxis=dict(visible=False) ) return fig def bubble_fig(df): bubble_chart_data = ( df.groupby("account_creation_time").size().reset_index(name="user_count") ) bubble_fig = px.scatter( bubble_chart_data, x="account_creation_time", y="user_count", size="user_count", title="Tiempo de Creación de Cuenta
vs. Número de Usuarios", labels={ "account_creation_time": "Tiempo de Creación de Cuenta (meses)", "user_count": "Número de Usuarios", }, ) return bubble_fig def hist_fig(df): hist_fig = px.histogram( df, x="account_creation_time", title="Distribución del Tiempo de Creación de Cuenta", labels={ "account_creation_time": "Tiempo de Creación de Cuenta (meses)", "user_count": "Número de Usuarios", }, nbins=25, ) return hist_fig def stacked_bar_fig(df): stacked_bar_fig = px.histogram( df, x="account_creation_time", color="sentiment_label", title="Distribución del Tiempo de
Creación de Cuenta por Sentimiento de Comentario", labels={ "account_creation_time": "Tiempo de Creación de Cuenta (meses)", "count": "Número de Usuarios", "sentiment_beto": "Sentimiento", }, barmode="stack", nbins=25, ) return stacked_bar_fig def metrics_bar(tweet_data, df): st.write( """ """, unsafe_allow_html=True, ) avg_time = df["account_creation_time"].mean() min_time = df["account_creation_time"].min() max_time = df["account_creation_time"].max() left, right = st.columns([2, 1]) with left: with st.container(border=True): # st.write("###### Analysis of Time Metrics") col1, col2, col3 = st.columns(3) col1.metric("Tiempo Promedio", f"{round(avg_time/12)} años") col2.metric("### Tiempo Mínimo", f"{min_time} meses") col3.metric("Tiempo Máximo", f"{round(max_time/12)} años") with right: with st.container(border=True): # st.write("###### Sentiment Breakdown") pos, neu, neg = st.columns(3) # st.info(f"##### **Overall Sentiment**: :{TEXT_COLOR[tweet_data['overall_sentiment'].lower()]}[**{tweet_data['overall_sentiment']}**]") pos.metric(label=":green[Positive]", value=tweet_data["positive"]) neu.metric(label=":gray[Neutral]", value=tweet_data["neutral"]) neg.metric(label=":red[Negative]", value=tweet_data["negative"])