Spaces:
Sleeping
Sleeping
fcernafukuzaki
committed on
Upload 2 files
Browse files- app.py +197 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
# Pandas
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
# Expresiones regulares
|
8 |
+
import re
|
9 |
+
|
10 |
+
# Matplotlib, Seaborn y Plotly
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import seaborn as sns
|
13 |
+
|
14 |
+
# NLTK
|
15 |
+
import nltk
|
16 |
+
from nltk.corpus import stopwords
|
17 |
+
|
18 |
+
# spaCy
|
19 |
+
import spacy
|
20 |
+
|
21 |
+
# PySentimiento y Transformers
|
22 |
+
from pysentimiento import create_analyzer
|
23 |
+
from sentence_transformers import SentenceTransformer
|
24 |
+
|
25 |
+
# Word cloud
|
26 |
+
from PIL import Image
|
27 |
+
|
28 |
+
import uuid
|
29 |
+
import gradio as gr
|
30 |
+
|
31 |
+
# Download the NLTK corpora needed at runtime: the Spanish stop-word list and
# the 'punkt' tokenizer models (no-ops when already present in the data dir).
nltk.download('stopwords')
nltk.download('punkt')
|
33 |
+
|
34 |
+
|
35 |
+
### Reformado. Antes hacia reproceso para obtener output y probas. Se puede hacer en un paso.
|
36 |
+
### Refactored: single prediction pass; columns extracted without re-running the model.
def get_sentiment(df, column):
    """Run Spanish sentiment analysis (pysentimiento) over ``df[column]``.

    Mutates ``df`` in place (and returns it) with the new columns:
      - 'Polaridad': predicted label for each row
      - 'sent_NEU', 'sent_NEG', 'sent_POS': per-class probabilities
        (None when a class is missing from the analyzer output)
    """
    analyzer = create_analyzer(task="sentiment", lang="es")
    # Wrap the iterable with tqdm directly instead of driving a manual pbar.
    outputs = [
        analyzer.predict(text)
        for text in tqdm(df[column], desc="Analyzing Comments")
    ]

    # Extract label and probabilities from the cached outputs — no reprocessing.
    df['Polaridad'] = [out.output for out in outputs]
    df['sent_NEU'] = [out.probas.get('NEU') for out in outputs]
    df['sent_NEG'] = [out.probas.get('NEG') for out in outputs]
    df['sent_POS'] = [out.probas.get('POS') for out in outputs]
    return df
|
61 |
+
|
62 |
+
### Reformado. Antes hacia reproceso para obtener output y probas. Se puede hacer en un paso.
|
63 |
+
### Refactored: single prediction pass; columns extracted without re-running the model.
def get_emotions(df, column):
    """Run Spanish emotion analysis (pysentimiento) over ``df[column]``.

    Mutates ``df`` in place (and returns it) with the new columns:
      - 'Emocion': predicted emotion label for each row
      - 'emo_<emotion>' for each of anger, sadness, surprise, disgust, joy,
        fear, others: per-class probabilities (None when a class is missing)
    """
    analyzer = create_analyzer(task="emotion", lang="es")
    # Wrap the iterable with tqdm directly instead of driving a manual pbar.
    outputs = [
        analyzer.predict(text)
        for text in tqdm(df[column], desc="Analyzing Comments")
    ]

    df['Emocion'] = [out.output for out in outputs]
    # One loop over the known emotion classes replaces seven copy-pasted passes.
    for emotion in ('anger', 'sadness', 'surprise', 'disgust', 'joy', 'fear', 'others'):
        df[f'emo_{emotion}'] = [out.probas.get(emotion) for out in outputs]
    return df
|
96 |
+
|
97 |
+
class ProcesamientoLenguaje:
    """Spanish text-processing helpers: POS-based token filtering and regex cleanup."""

    def __init__(self):
        # Only the tagger is needed for POS filtering; parser/NER are disabled for speed.
        self.nlp = spacy.load('es_core_news_md', disable=["parser", "ner"])

    def postags_and_stopwords(self, texts, allowed_postags=('NOUN', 'ADJ', 'PROPN', 'VERB', 'X')):
        '''Tokenize and POS-tag *texts* with the spaCy pipeline, then keep only
        tokens longer than 2 characters that are not stop words and whose
        coarse POS tag is allowed: noun, adjective, proper noun, verb, or
        anything spaCy could not categorize (OOV words, unrecognized proper
        names, etc). Returns the filtered tokens joined into one string.

        Bug fix: the original default used 'VB' (a Penn Treebank fine-grained
        tag); spaCy's ``token.pos_`` holds Universal POS tags, where verbs are
        'VERB', so verbs were never kept despite being documented. The default
        is also now a tuple to avoid a mutable default argument.

        NOTE(review): relies on the module-level ``stop_words`` list being
        defined before this method is called.
        '''
        return ' '.join(
            token.text for token in self.nlp(texts)
            if token.pos_ in allowed_postags
            and token.text not in stop_words
            and len(token.text) > 2
        )

    def cleaner(self, word):
        '''Strip noise from a raw comment string (URLs, symbols, repeated
        letters, Twitter handles, very short words). Returns the cleaned string.
        '''
        word = re.sub(r'https?\S+', '', word)  # remove all URLs
        word = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', "", word)  # remove text emoticons (:-), ;D, =P, ...)
        word = re.sub(r'ee.uu', 'eeuu', word, flags=re.IGNORECASE)  # normalize every EE.UU variant (any separator) to EEUU
        word = re.sub(r'\#\.', '', word)
        word = re.sub(r'\n', ' ', word)  # replace line breaks with spaces
        word = re.sub(r',', '', word)  # remove commas
        word = re.sub(r'\-', ' ', word)  # replace hyphens with spaces
        word = re.sub(r'\.{3}', ' ', word)  # remove ellipses
        word = re.sub(r'a{2,}', 'a', word)  # collapse repeated 'a' (e.g. aaaaaaah, holaaaaaa)
        word = re.sub(r'é{2,}', 'é', word)  # collapse repeated 'é' (e.g. volvééééé)
        word = re.sub(r'i{2,}', 'i', word)  # collapse repeated 'i' (e.g. salíiiiiii)
        word = re.sub(r'ja{2,}', 'ja', word)  # collapse laughter (e.g. jaaaaaa)
        word = re.sub(r'[^\w\s@ñ]', '', word, flags=re.UNICODE)  # drop non-alphanumeric symbols except @ and ñ
        word = re.sub(r'\b@\w+\b', '', word)  # remove Twitter handles
        word = re.sub(r'\b\w{1,2}\b', '', word)  # remove one- and two-letter words
        return word
|
137 |
+
|
138 |
+
def grafico_pie(df, column_name='Polaridad'):
    """Render a pie chart of the value distribution of *column_name* and save
    it to a uniquely named JPG file. Returns the saved file's path."""
    counts = df[column_name].value_counts()
    destino = f"{uuid.uuid4()}_sentimiento.jpg"
    plt.figure(figsize=(8, 6))
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    plt.title("Distribución de Polaridad")
    plt.savefig(destino, bbox_inches="tight")
    plt.close()
    return destino
|
147 |
+
|
148 |
+
def grafico_barras(df, column_name='Emocion'):
    """Render a labelled count (bar) chart for *column_name* and save it to a
    uniquely named JPG file. Returns the saved file's path."""
    # Fixed copy-paste from grafico_pie: this chart is about emotions, so the
    # file suffix is now "_emocion.jpg" (callers only use the returned path).
    file_path = f"{uuid.uuid4()}_emocion.jpg"
    plt.figure(figsize=(8, 6))
    ax = sns.countplot(x=column_name, data=df)
    # Annotate each bar with its count, offset 10 points above the bar top.
    for p in ax.patches:
        ax.annotate(
            format(p.get_height()),
            (p.get_x() + p.get_width() / 2., p.get_height()),
            ha='center', va='center',
            xytext=(0, 10), textcoords='offset points',
        )
    plt.xlabel("Emocion")
    plt.ylabel("Cantidad")
    plt.title("Histograma de Emocion")
    plt.savefig(file_path, bbox_inches="tight")
    plt.close()
    return file_path
|
160 |
+
|
161 |
+
# Module-level singletons shared by procesar_csv: the spaCy-backed text
# processor and the NLTK Spanish stop-word list.
pln = ProcesamientoLenguaje()
stop_words = stopwords.words('spanish')
|
163 |
+
|
164 |
+
# Función que lee el archivo CSV
|
165 |
+
# Gradio handler: reads and processes the uploaded CSV file.
def procesar_csv(file):
    """Read a ';'-delimited CSV with 'Fecha' and 'Comentario' columns, run
    sentiment and emotion analysis, clean the comments, and return
    (10-row preview, processed CSV path, pie chart path, bar chart path).
    """
    if file is None:
        # Bug fix: the interface declares four outputs, so returning a single
        # string here broke Gradio's output mapping. Raise a user-visible
        # error instead.
        raise gr.Error("No se ha cargado ningún archivo.")
    df = pd.read_csv(file.name, delimiter=';')
    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%d/%m/%y')
    df = get_sentiment(df, "Comentario")
    df = get_emotions(df, "Comentario")

    # Clean comments: regex cleanup, stop-word removal, then POS filtering.
    stop_set = set(stop_words)  # O(1) membership instead of scanning a list per word
    df['Comentario_clean'] = df['Comentario'].apply(pln.cleaner)
    df['Comentario_clean'] = df['Comentario_clean'].apply(
        lambda x: ' '.join(w for w in x.split() if w.lower() not in stop_set)
    )
    df['Comentario_clean'] = df['Comentario_clean'].apply(pln.postags_and_stopwords)

    output_file = f"{uuid.uuid4()}_processed_output.csv"
    df.to_csv(output_file, index=False)

    grafico_pie_path = grafico_pie(df)
    grafico_barras_path = grafico_barras(df)
    return df.head(10), output_file, grafico_pie_path, grafico_barras_path  # show the first rows
|
182 |
+
|
183 |
+
|
184 |
+
# Build the Gradio interface: one CSV file in, four outputs out (preview
# dataframe, downloadable processed CSV, and the two saved chart images).
interface = gr.Interface(
    fn=procesar_csv,
    inputs=gr.File(label="Archivo CSV"),
    outputs=[gr.Dataframe(label="Vista previa del archivo procesado"),
             gr.File(label="Descargar CSV procesado"),
             gr.Image(type="filepath", label="Gráfico de torta"),
             gr.Image(type="filepath", label="Gráfico de barras")],
    title="Cargar y visualizar CSV",
    description="Sube un archivo CSV para ver los primeros registros. El archivo CSV debe tener los campos Fecha y Comentario."
)

# Run the Gradio app.
interface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
matplotlib==3.9.2
|
2 |
+
seaborn==0.13.2
|
3 |
+
nltk==3.9.1
|
4 |
+
pysentimiento==0.7.3
|
5 |
+
sentence-transformers==3.2.1
|
6 |
+
spacy==3.7.5
|
7 |
+
tqdm==4.66.6
|
8 |
+
es-core-news-md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.7.0/es_core_news_md-3.7.0-py3-none-any.whl#sha256=0d6d6ebed875869a9759c8c096f2cef581fa32d861646030f771c83e5799de82
|