|
import csv
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used below: the Spanish stopword list and the
# Punkt models behind word_tokenize (newer NLTK releases may also need
# nltk.download('punkt_tab')).
nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')
|
|
def remove_html_markup(s):
    """Strip HTML tags from a string with a small state machine.

    Quote characters are tracked so that '<' or '>' inside an attribute
    value does not open or close a tag prematurely.
    """
    tag = False
    quote = False
    out = ""

    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c

    return out
|
|
def remove_URL(s):
    """Remove URLs from a string."""
    return re.sub(r"http\S+", "", s)
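# Quick sanity check for the two cleaners above (the sample string is
# illustrative, not taken from the original dataset):
#   >>> remove_URL(remove_html_markup('<a href="http://x.com">Nota</a> http://x.com/y'))
#   'Nota '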
|
|
def eliminar_puntuacion(articulo):
    """Remove punctuation and symbol characters from a string."""
    # The loop compares single characters, so only single-character
    # symbols are listed; sequences like '...' are covered by '.'.
    deletion_symbols = set('!()\'-[]{};:"“’”`‘\\/|,<>.?@#$^&*_~+%=¿¡')
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo
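# Illustrative example (made-up title):
#   >>> eliminar_puntuacion('¡hola, mundo! (prueba)')
#   'hola mundo prueba'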
|
|
def eliminar_stopwords(articulo):
    """Drop Spanish stopwords from a whitespace-separated string."""
    return " ".join(
        palabra for palabra in articulo.split() if palabra not in stopwords_es
    )
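# Illustrative example; 'las', 'de' and 'la' are in NLTK's Spanish
# stopword list:
#   >>> eliminar_stopwords('las noticias de la semana')
#   'noticias semana'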
|
|
def obtener_raices(articulo):
    """Reduce every word to its stem with the Spanish Snowball stemmer."""
    return " ".join(spanish_stemmer.stem(palabra) for palabra in articulo.split())
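# Illustrative example; exact stems follow the Snowball Spanish rules
# (e.g. 'economicas' reduces to 'econom'):
#   >>> obtener_raices('noticias economicas')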
|
|
def limpieza_articulos(df):
    """Clean the 'titulo' column of df and return its sorted unique vocabulary."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])

    # Normalize each title: lowercase, strip punctuation, drop stopwords.
    df_titulos['titulo'] = df_titulos['titulo'].str.lower()
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_puntuacion)
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_stopwords)

    # Tokenize the concatenated titles; np.unique returns the sorted vocabulary.
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab
|
|
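# Minimal end-to-end sketch. It assumes a DataFrame with a 'titulo'
# column, as limpieza_articulos expects; the sample titles are made up.
if __name__ == "__main__":
    ejemplo = pd.DataFrame(
        {'titulo': ['¡Sube el precio de la luz!',
                    'Nuevas noticias de economía']}
    )
    print(limpieza_articulos(ejemplo))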