import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

# NLTK resources: the Spanish stopword list and the Punkt tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')

def remove_html_markup(s):
    """Strip HTML tags, ignoring '<' and '>' inside quoted attribute values."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out

def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)

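# A minimal sanity check for the two cleaners above; the sample string is
# illustrative only, not taken from the original dataset.
ejemplo = '<a href="nota.html">Leer mas</a> en http://ejemplo.com/nota'
print(remove_URL(remove_html_markup(ejemplo)))  # -> 'Leer mas en '
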
def eliminar_puntuacion(articulo):
    """Remove punctuation and other special symbols, character by character."""
    # Duplicates and multi-character entries ('..', '...', '``', "''") from the
    # original list are dropped: the loop compares one character at a time, so
    # they could never match.
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':',
                        '"', '“', '’', '”', '`', '‘', '\\', '/', '|', ',', '<',
                        '>', '.', '?', '@', '#', '$', '^', '&', '*', '_', '~',
                        '+', '%', '=', '¿', '¡']
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo

def eliminar_stopwords(articulo):
    """Remove Spanish stopwords (NLTK corpus) from a whitespace-split string."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo

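# Illustrative chained usage (hypothetical headline): lowercase first, since
# the NLTK stopword list is lowercase, then strip punctuation and stopwords.
titular = 'La economia de Mexico crecio un 3% este año!'
print(eliminar_stopwords(eliminar_puntuacion(titular.lower())))
# -> ' economia mexico crecio 3 año'
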
def obtener_raices(articulo):
    """Replace each word with its Spanish Snowball stem."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo

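# Stemming sketch: the Spanish Snowball stemmer collapses inflected forms of a
# verb to a shared root (the words below are illustrative).
print(obtener_raices('corriendo corrieron corre'))  # stems such as ' corr corr corr'
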
def limpieza_articulos(df):
    """Clean the 'titulo' column and return the sorted vocabulary of its words."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords using NLTK's Spanish corpus
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab
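
# End-to-end sketch on a tiny hypothetical DataFrame with the 'titulo' column
# that limpieza_articulos expects; the titles below are made up.
df_demo = pd.DataFrame({'titulo': ['El clima cambia rapido',
                                   'Cambio climatico y economia']})
print(limpieza_articulos(df_demo))
# -> ['cambia' 'cambio' 'clima' 'climatico' 'economia' 'rapido']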