Spaces:

arpy8
/

Omdena-IREX-Sentiment-Analysis

Sleeping

App Files Files Community

Omdena-IREX-Sentiment-Analysis / utils /graph_functions.py

arpy8

update analyse and graph functions

f3b949b 12 months ago

raw

history blame contribute delete

9.81 kB

	import random
	import warnings
	import pandas as pd
	from PIL import Image
	import streamlit as st
	from wordcloud import WordCloud
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import os
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from utils.utils import get_top_ngram

	# --- NLTK resource bootstrap (executed once, at import time) ---------------
	# Directory holding the NLTK corpora/tokenizers. It can be overridden with the
	# NLTK_PATH environment variable; an unset or empty value selects the default.
	NLTK_DATA = os.getenv("NLTK_PATH") or "/home/user/code/nltk_data"

	# Register the directory a single time so the lookups below, and the later
	# stopwords / word_tokenize calls, search it.
	if NLTK_DATA not in nltk.data.path:
	    nltk.data.path.append(NLTK_DATA)

	# NOTE: this silences every warning process-wide, not only the ones raised by
	# this module.
	warnings.filterwarnings("ignore")

	# Fetch each required resource only when it is not already installed.
	for _resource_path, _package in (
	    ("corpora/stopwords.zip", "stopwords"),
	    ("tokenizers/punkt.zip", "punkt"),
	):
	    try:
	        nltk.data.find(_resource_path)
	    except LookupError:
	        # Prefer the configured directory; fall back to NLTK's default download
	        # location only if that attempt reports failure (falsy return value).
	        if not nltk.download(_package, download_dir=NLTK_DATA):
	            nltk.download(_package)


	@st.cache_data
	def load_data(df):
	    """Return a copy of ``df`` with parsed timestamps and a day-level ``date`` column.

	    The input frame is left untouched. Because this function is cached, an
	    in-place edit would only reach the caller's frame on a cache miss, so
	    callers must use the returned frame.

	    Args:
	        df: DataFrame containing a ``createdAt`` column that
	            ``pandas.to_datetime`` can parse.

	    Returns:
	        A new DataFrame in which ``createdAt`` holds datetimes and ``date``
	        holds the same instants formatted as ``YYYY-MM-DD`` strings.
	    """
	    created_at = pd.to_datetime(df["createdAt"])
	    return df.assign(
	        createdAt=created_at,
	        date=created_at.dt.strftime("%Y-%m-%d"),
	    )


	@st.cache_data
	def process_texts(texts):
	    """Tokenize each text, lower-case the tokens and drop Spanish stop words.

	    Args:
	        texts: collection of strings exposing ``.apply`` (used as a pandas
	            Series by the callers visible in this module).

	    Returns:
	        The result of the chained ``.apply`` calls: one string per input text,
	        made of the surviving lower-cased tokens joined by single spaces.
	    """
	    # Colloquial fillers and very common verbs removed in addition to NLTK's
	    # Spanish stop-word list.
	    extra_stopwords = {
	        'ser', 'haber', 'hacer', 'tener', 'poder', 'ir', 'q', 'si', 'solo',
	        'saber', 'decir', 'dar', 'querer', 'ver', 'así', 'sos', 'maje', 'dejar',
	        'op', 'vos', 'cada', 'mismo', 'usted', 'mas', 'pues', 'andar', 'ahora',
	        'claro', 'nunca', 'quedar', 'pasar', 'venir', 'poner', 'dio', 'señora',
	        'señor', 'ahí', 'asi', 'vez', 'jajaja',
	    }
	    spanish_stopwords = set(stopwords.words("spanish")) | extra_stopwords

	    def _drop_stopwords(tokens):
	        # Lower-case first so the membership test and the output agree.
	        lowered = (token.lower() for token in tokens)
	        return [token for token in lowered if token not in spanish_stopwords]

	    # NOTE(review): word_tokenize is called with its default language even
	    # though the stop words are Spanish; confirm whether that is intended.
	    return texts.apply(word_tokenize).apply(_drop_stopwords).apply(" ".join)


	def custom_color_func(word, font_size, position, orientation, font_path, random_state):
	    """Pick one of the three dashboard colours for a word-cloud word.

	    The signature matches the ``color_func`` callback that ``WordCloud``
	    invokes; only ``random_state`` influences the result.

	    Args:
	        word, font_size, position, orientation, font_path: accepted for
	            callback compatibility and ignored.
	        random_state: random generator supplied by the caller. When it exposes
	            a ``choice`` method (e.g. ``random.Random``) it is used, so a
	            seeded generator yields reproducible colours. Any other value
	            (``None`` included) falls back to the global ``random`` module.

	    Returns:
	        One of ``"#ff2b2b"``, ``"#83c9ff"`` or ``"#0068c9"``.
	    """
	    color_palette = ("#ff2b2b", "#83c9ff", "#0068c9")
	    choose = getattr(random_state, "choice", random.choice)
	    return choose(color_palette)


	def display_word_cloud(dataframe):
	    """Render a word cloud of ``dataframe["text"]`` inside a Plotly figure.

	    Args:
	        dataframe: DataFrame with a ``text`` column. Missing values are
	            skipped and remaining cells are converted to ``str`` before being
	            joined, so NaN or non-string cells no longer make the join fail.

	    Returns:
	        A ``plotly.graph_objects.Figure`` (500x170, axes hidden, no margins)
	        whose only content is the word-cloud bitmap as a layout image.
	    """
	    all_text = " ".join(dataframe["text"].dropna().astype(str))

	    # No ``colormap`` is passed: an explicit ``color_func`` takes precedence
	    # over it, so the colours always come from custom_color_func.
	    wordcloud = WordCloud(
	        background_color="#fff",
	        color_func=custom_color_func,
	    ).generate(all_text)
	    wordcloud_image = Image.fromarray(wordcloud.to_array())

	    fig = go.Figure()
	    fig.add_layout_image(
	        dict(
	            source=wordcloud_image,
	            x=0,
	            y=1,
	            sizex=1,
	            sizey=1.3,
	            opacity=1,
	        )
	    )
	    fig.update_layout(
	        autosize=False,
	        height=170,
	        width=500,
	        margin=dict(l=0, r=0, t=0, b=0),
	        xaxis=dict(visible=False),
	        yaxis=dict(visible=False),
	    )

	    return fig


	def most_common_trigrams(df, pdf=False):
	    """Plot the 15 most frequent n-grams per sentiment as horizontal bars.

	    Despite the name, ``get_top_ngram`` is called with ``n=2``, i.e. the bars
	    show bigrams.

	    Args:
	        df: DataFrame with ``sentiment_label`` (values ``"POS"``, ``"NEU"``,
	            ``"NEG"``) and ``text`` columns.
	        pdf: unused; kept so existing callers remain valid.

	    Returns:
	        A ``plotly.graph_objects.Figure`` with one row and three columns
	        (positive, neutral, negative). A sentiment with no texts or no
	        n-grams leaves its subplot empty instead of raising.
	    """
	    # (legend name, value stored in df["sentiment_label"], bar colour)
	    sentiments = (
	        ("positive", "POS", "#ff2b2b"),
	        ("neutral", "NEU", "#83c9ff"),
	        ("negative", "NEG", "#0068c9"),
	    )
	    fig = make_subplots(rows=1, cols=len(sentiments))

	    for col, (name, label, color) in enumerate(sentiments, start=1):
	        texts = df.loc[df["sentiment_label"] == label, "text"].dropna()
	        if texts.empty:
	            continue

	        # Assumes get_top_ngram returns a sequence of (ngram, count) pairs,
	        # as implied by how the two unpacked lists are used below.
	        top_ngrams = get_top_ngram(process_texts(texts), 2)[:15]
	        if len(top_ngrams) == 0:
	            continue
	        ngrams, counts = map(list, zip(*top_ngrams))

	        fig.add_trace(
	            go.Bar(
	                x=counts,
	                orientation="h",
	                name=name.title(),
	                marker=dict(color=color),
	                text=ngrams,
	                textposition="inside",
	                # Bars are horizontal, so the count lives on x; y is only
	                # the bar's positional index.
	                hovertemplate="%{text}: %{x}",
	            ),
	            1,
	            col,
	        )

	    fig.update_layout(
	        autosize=False,
	        margin=dict(t=0, b=0, l=0, r=0),
	        height=250,
	    )

	    return fig


	def display_target_count(df):
	    """Show the sentiment distribution as a pie chart next to a bar chart.

	    Args:
	        df: DataFrame with a ``sentiment_label`` column.

	    Returns:
	        A ``plotly.graph_objects.Figure`` with a pie (left) and a bar chart
	        (right), both built from ``sentiment_label`` value counts.
	    """
	    # Count once and reuse for both traces.
	    counts = df["sentiment_label"].value_counts()
	    labels = counts.index
	    values = counts.values

	    # Tie each colour to its label (same mapping the other sentiment charts in
	    # this module use) rather than to its frequency rank. Labels outside the
	    # mapping keep the previous positional palette.
	    label_colors = {"POS": "#ff2b2b", "NEU": "#83c9ff", "NEG": "#0068c9"}
	    fallback = ["#83c9ff", "#ff2b2b", "#0068c9"]
	    colors = [
	        label_colors.get(label, fallback[idx % len(fallback)])
	        for idx, label in enumerate(labels)
	    ]

	    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]])
	    fig.add_trace(
	        go.Pie(
	            labels=labels,
	            values=values,
	            hoverinfo="label+percent",
	            textfont_size=18,
	            marker=dict(colors=colors, line=dict(color="#fff", width=1)),
	        ),
	        1,
	        1,
	    )
	    fig.add_trace(
	        go.Bar(
	            x=labels,
	            y=values,
	            marker_color=colors,
	        ),
	        1,
	        2,
	    )
	    fig.update_layout(
	        title_text="Análisis de Sentimientos",
	        title_y=1,
	        title_font=dict(color="#808495", size=15),
	        autosize=True,
	        height=250,
	        margin=dict(l=0, r=0, t=25, b=10),
	        xaxis=dict(visible=False),
	        yaxis=dict(visible=False),
	    )

	    return fig


	def sentiment_over_date(df):
	    """Stacked area chart of the number of rows per day and sentiment label.

	    Args:
	        df: DataFrame with ``createdAt`` and ``sentiment_label`` columns; it is
	            passed through ``load_data`` to obtain the ``date`` column.

	    Returns:
	        A ``plotly.graph_objects.Figure`` (500x250, white background) with one
	        stacked line trace per sentiment label.
	    """
	    df = load_data(df)
	    grouped = df.groupby(["date", "sentiment_label"]).size().unstack(fill_value=0)

	    # Colour by label so a missing label cannot shift the remaining colours.
	    # Labels outside the mapping cycle through the previous positional
	    # palette, which also avoids an IndexError with more than three labels.
	    label_colors = {"NEG": "#0068c9", "NEU": "#83c9ff", "POS": "#ff2b2b"}
	    fallback = ["#0068c9", "#83c9ff", "#ff2b2b"]

	    fig = go.Figure()
	    for idx, sentiment_label in enumerate(grouped.columns):
	        color = label_colors.get(sentiment_label, fallback[idx % len(fallback)])
	        fig.add_trace(
	            go.Scatter(
	                x=grouped.index,
	                y=grouped[sentiment_label],
	                mode="lines",
	                # str() keeps non-string labels from breaking .capitalize().
	                name=str(sentiment_label).capitalize(),
	                stackgroup="one",
	                line=dict(width=2, color=color),
	                fillcolor=color,
	                hoverinfo="y+name",
	            )
	        )
	    fig.update_xaxes(showgrid=False)
	    fig.update_yaxes(showgrid=False)
	    fig.update_layout(
	        title={
	            "text": "Sentimiento a través del tiempo",
	            "x": 0.2,
	            "y": 1,
	            "xanchor": "center",
	            "yanchor": "top",
	            "font": {"size": 15, "color": "#808495", "family": "Arial"},
	        },
	        xaxis_title="Fecha",
	        yaxis_title="Conteo",
	        hovermode="x",
	        showlegend=True,
	        autosize=False,
	        height=250,
	        width=500,
	        margin=dict(l=0, r=0, t=40, b=0),
	        plot_bgcolor="white",
	        paper_bgcolor="white",
	    )

	    return fig


	##############################################################################################################################


	def crear_grafico_dispersion(df):
	    """Build a scatter plot of like count versus sentiment label.

	    Args:
	        df: DataFrame with ``likeCount`` and ``sentiment_label`` columns.

	    Returns:
	        The Plotly Express scatter figure, 250 px high, with points coloured
	        by ``sentiment_label``.
	    """
	    axis_labels = {
	        "likeCount": "Número de Likes",
	        "sentiment_label": "Etiqueta de Sentimiento",
	    }
	    scatter = px.scatter(
	        df,
	        x="likeCount",
	        y="sentiment_label",
	        color="sentiment_label",
	        labels=axis_labels,
	        title="Relación entre Número de Likes y Etiquetas de Sentimiento",
	    )

	    # Compact layout; both axes stay visible.
	    title_style = {"color": "#808495", "size": 15}
	    tight_margin = {"l": 0, "r": 0, "t": 20, "b": 0}
	    scatter.update_layout(
	        title_y=1,
	        title_font=title_style,
	        autosize=True,
	        height=250,
	        margin=tight_margin,
	    )

	    return scatter


	def bubble_fig(df):
	    """Bubble chart of how many rows share each ``account_creation_time``.

	    Args:
	        df: DataFrame with an ``account_creation_time`` column (labelled as
	            months on the axis).

	    Returns:
	        A Plotly Express scatter figure where both the y position and the
	        marker size encode the number of rows per ``account_creation_time``.
	    """
	    rows_per_age = df.groupby("account_creation_time").size()
	    user_counts = rows_per_age.reset_index(name="user_count")

	    axis_labels = {
	        "account_creation_time": "Tiempo de Creación de Cuenta (meses)",
	        "user_count": "Número de Usuarios",
	    }
	    return px.scatter(
	        user_counts,
	        x="account_creation_time",
	        y="user_count",
	        size="user_count",
	        title="Tiempo de Creación de Cuenta<br>vs. Número de Usuarios",
	        labels=axis_labels,
	    )


	def hist_fig(df):
	    """Histogram of ``account_creation_time`` using up to 25 bins.

	    Args:
	        df: DataFrame with an ``account_creation_time`` column.

	    Returns:
	        The Plotly Express histogram figure.
	    """
	    histogram_options = {
	        "x": "account_creation_time",
	        "title": "Distribución del Tiempo de Creación de Cuenta",
	        "labels": {
	            "account_creation_time": "Tiempo de Creación de Cuenta (meses)",
	            "user_count": "Número de Usuarios",
	        },
	        "nbins": 25,
	    }
	    return px.histogram(df, **histogram_options)


	def stacked_bar_fig(df):
	    """Stacked histogram of ``account_creation_time`` split by sentiment.

	    Args:
	        df: DataFrame with ``account_creation_time`` and ``sentiment_label``
	            columns.

	    Returns:
	        A Plotly Express histogram (up to 25 bins, bars stacked) coloured by
	        ``sentiment_label``.
	    """
	    return px.histogram(
	        df,
	        x="account_creation_time",
	        color="sentiment_label",
	        title="Distribución del Tiempo de <br>Creación de Cuenta por Sentimiento de Comentario",
	        labels={
	            "account_creation_time": "Tiempo de Creación de Cuenta (meses)",
	            "count": "Número de Usuarios",
	            # The key must name the column passed to ``color`` for the legend
	            # title to be relabelled; the previous key matched no column.
	            "sentiment_label": "Sentimiento",
	        },
	        barmode="stack",
	        nbins=25,
	    )


	def metrics_bar(tweet_data, df):
	    """Render the summary metric cards (account-age stats and sentiment counts).

	    Writes directly to the Streamlit page and returns nothing.

	    Args:
	        tweet_data: mapping providing the ``"positive"``, ``"neutral"`` and
	            ``"negative"`` values shown in the right-hand cards.
	        df: DataFrame with an ``account_creation_time`` column (presumably in
	            months, given the "meses" label and the division by 12 for years).
	            When the column has no data the time cards show ``"N/D"`` instead
	            of raising on ``round(nan)``.
	    """
	    # Card styling for every st.metric on the page.
	    st.write(
	        """
	        <style>
	        div[data-testid="stMetric"]
	        {
	        background-color: #00000005;
	        color: black;
	        padding: 10px 0 0 10px;
	        border-radius: 5px;
	        }
	        </style>

	        """,
	        unsafe_allow_html=True,
	    )

	    creation_time = df["account_creation_time"]
	    avg_time = creation_time.mean()
	    min_time = creation_time.min()
	    max_time = creation_time.max()

	    def _as_years(months):
	        # mean()/max() are NaN for an empty or all-missing column.
	        return "N/D" if pd.isna(months) else f"{round(months / 12)} años"

	    def _as_months(months):
	        return "N/D" if pd.isna(months) else f"{months} meses"

	    left, right = st.columns([2, 1])

	    with left:
	        with st.container(border=True):
	            col1, col2, col3 = st.columns(3)
	            col1.metric("Tiempo Promedio", _as_years(avg_time))
	            # NOTE(review): the "### " prefix is inconsistent with the sibling
	            # labels; left untouched here, confirm whether it is intentional.
	            col2.metric("### Tiempo Mínimo", _as_months(min_time))
	            col3.metric("Tiempo Máximo", _as_years(max_time))

	    with right:
	        with st.container(border=True):
	            pos, neu, neg = st.columns(3)
	            pos.metric(label=":green[Positive]", value=tweet_data["positive"])
	            neu.metric(label=":gray[Neutral]", value=tweet_data["neutral"])
	            neg.metric(label=":red[Negative]", value=tweet_data["negative"])