import pandas as pd |
import numpy as np |
import nltk |
from nltk.tokenize import word_tokenize, RegexpTokenizer |
from nltk.corpus import stopwords |
from nltk.stem import SnowballStemmer |
from sklearn.feature_extraction.text import CountVectorizer |
import csv |
import re |
# Fetch the NLTK data packages this module needs at import time.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt models used by word_tokenize from the
# 'punkt_tab' package instead of 'punkt'; fetch both so either version works.
nltk.download('punkt_tab')

# Spanish stopword list and stemmer shared by the cleaning helpers below.
stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')
def remove_html_markup(s):
    """Return *s* with HTML tags stripped, keeping only the text between them.

    The string is scanned once with a small state machine:

    * ``<`` opens a tag and ``>`` closes it, unless they occur inside a
      quoted attribute value;
    * inside a tag, ``"`` or ``'`` opens a quoted value that is closed only
      by the *same* quote character, so ``title="it's"`` is handled
      correctly;
    * characters outside any tag (including quote characters) are kept.

    Args:
        s: Text that may contain HTML markup.

    Returns:
        The text with every tag removed.
    """
    in_tag = False
    open_quote = None  # quote character that opened the current value, if any
    kept = []
    for ch in s:
        if open_quote is not None:
            # Inside a quoted attribute value: everything is markup, and
            # only the matching quote character ends the value.
            if ch == open_quote:
                open_quote = None
        elif ch == '<':
            in_tag = True
        elif ch == '>':
            in_tag = False
        elif in_tag:
            if ch == '"' or ch == "'":
                open_quote = ch
        else:
            kept.append(ch)
    return "".join(kept)
def remove_URL(s):
    """Remove URLs from a string.

    Every run of non-whitespace characters that starts with ``http`` is
    deleted. Surrounding whitespace is left untouched, so removing a URL
    from the middle of a sentence leaves two adjacent spaces.

    Args:
        s: Input text.

    Returns:
        The text with all ``http...`` runs removed.
    """
    return re.sub(r"http\S+", "", s)
# Characters deleted by eliminar_puntuacion, grouped for readability.
_PUNCTUATION_TO_DELETE = (
    "!()-[]{};:,.?"   # common ASCII punctuation
    "'\"`"            # straight quotes and backtick
    "“”‘’"            # typographic quotes
    "\\/|<>"          # slashes, pipe and angle brackets
    "@#$^&*_~+%="     # symbols
    "¿¡"              # Spanish opening marks
)
# Translation table mapping each of those characters to "deleted".
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION_TO_DELETE)


def eliminar_puntuacion(articulo):
    """Delete punctuation and symbol characters from *articulo*.

    Characters are removed, not replaced by a space, so ``"hola,mundo"``
    becomes ``"holamundo"``. Letters (including accented ones), digits and
    whitespace are preserved.

    Args:
        articulo: Text to clean.

    Returns:
        The text without any of the characters in
        ``_PUNCTUATION_TO_DELETE``.
    """
    # Single pass; filtering is per character, so multi-character sequences
    # such as "..." need no special entry.
    return articulo.translate(_PUNCTUATION_TABLE)
def eliminar_stopwords(articulo):
    """Drop Spanish stopwords from *articulo*.

    The text is split on whitespace and every word found in the
    module-level ``stopwords_es`` list is discarded. The comparison is
    exact (case-sensitive).

    Args:
        articulo: Text to filter.

    Returns:
        The remaining words, each preceded by a single space (so a
        non-empty result starts with a space; an empty result is ``""``).
    """
    # Build the set per call so later changes to stopwords_es are honoured
    # while each membership test stays O(1).
    stopword_set = set(stopwords_es)
    return "".join(
        " " + word for word in articulo.split() if word not in stopword_set
    )
def obtener_raices(articulo):
    """Replace every word of *articulo* with its stem.

    The text is split on whitespace and each word is passed through the
    module-level ``spanish_stemmer``.

    Args:
        articulo: Text to stem.

    Returns:
        The stemmed words, each preceded by a single space (so a non-empty
        result starts with a space; an empty result is ``""``).
    """
    return "".join(
        " " + spanish_stemmer.stem(word) for word in articulo.split()
    )
def limpieza_articulos(df):
    """Build the vocabulary of the article titles in *df*.

    Each title is lower-cased, stripped of punctuation with
    ``eliminar_puntuacion`` and filtered with ``eliminar_stopwords``. The
    cleaned titles are then joined, tokenized with ``word_tokenize`` and
    de-duplicated.

    Args:
        df: Object whose ``'titulo'`` entry holds the titles (typically a
            DataFrame column). Missing titles are skipped and non-string
            values are converted with ``str``. The input is not modified.

    Returns:
        A NumPy array with the sorted unique tokens.
    """
    # Skip missing titles and coerce the rest to str so .lower() cannot fail.
    titulos = pd.Series(df['titulo']).dropna().astype(str).str.lower()
    titulos = titulos.apply(eliminar_puntuacion).apply(eliminar_stopwords)
    all_text = ' '.join(titulos)
    # NOTE(review): word_tokenize is called with its default language;
    # confirm whether language='spanish' is wanted here.
    return np.unique(word_tokenize(all_text))