import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import textacy

from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

# NLTK resources for tokenization and Spanish stopword removal
nltk.download('stopwords')
nltk.download('punkt')
stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')


def remove_html_markup(s):
    """Strip HTML tags, ignoring '<' and '>' inside quoted attribute values."""
    tag = False
    quote = False
    out = ""

    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c

    return out


def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)


def eliminar_puntuacion(articulo):
    """Remove punctuation and other special characters, one character at a time."""
    # Multi-character entries from the original list (e.g. '..', '...', '``', "''") are
    # dropped here: the loop compares single characters, so they could never match anyway.
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"', '“', '’', '”',
                        '`', '‘', '\\', '/', '|', ',', '<', '>', '.', '?', '@', '#', '$', '^',
                        '&', '*', '_', '~', '+', '%', '=', '¿', '¡']

    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo


def remove_emoji(s):
    """Remove emoji from a string."""
    regex_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "]+",
        flags=re.UNICODE,
    )
    return regex_pattern.sub(r'', s)


def remover_casos_especiales(s):
    """Remove a leading dateline-style prefix ending in '.-' (e.g. 'Santo Domingo.- ...')."""
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*.-', '', s)
    return s


def frases_remover(s):
    """Remove boilerplate phrases (feed credits, 'read more' links, etc.) from the text."""
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)', 'se publicó primero en',
                            'Remolacha - Noticias Republica Dominicana', 'Read more ›', 'Read more',
                            '[…]', 'RELACIONADAS']
    for frase in lista_frases_remover:
        s = s.replace(frase, '')
    return s


def eliminar_stopwords(articulo):
    """Drop Spanish stopwords from a whitespace-tokenized string."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo


def obtener_raices(articulo):
    """Replace each word with its Snowball (Spanish) stem."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo


def limpieza_articulos(df):
    """Return the vocabulary of the 'titulo' column after lowercasing and removing punctuation and stopwords."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab


def obtener_kpes(df):
    """Extract the top 10 TextRank key terms from the titles (requires the 'es_core_news_sm' spaCy model)."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = '. '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_news_sm')
    return textacy.extract.keyterms.textrank(titulos, normalize='lemma', topn=10)
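

# --- Example usage (not part of the original script) ---
# A minimal sketch of how the helpers above could be chained. The sample titles are
# made up, and obtener_kpes assumes the 'es_core_news_sm' spaCy model is installed
# (e.g. via `python -m spacy download es_core_news_sm`).
if __name__ == "__main__":
    ejemplo = pd.DataFrame({
        'titulo': [
            "Santo Domingo.- El Gobierno anuncia un nuevo plan de energía 🎉 http://example.com",
            "La entrada Aumenta el turismo en la región se publicó primero en Remolacha - Noticias Republica Dominicana",
        ]
    })

    # Per-title cleaning: HTML markup, URLs, emoji, boilerplate phrases, datelines.
    ejemplo['titulo'] = (
        ejemplo['titulo']
        .apply(remove_html_markup)
        .apply(remove_URL)
        .apply(remove_emoji)
        .apply(frases_remover)
        .apply(remover_casos_especiales)
    )
    print(ejemplo['titulo'].tolist())

    # Vocabulary of the cleaned titles (lowercased, punctuation and stopwords removed).
    print(limpieza_articulos(ejemplo))

    # Stemmed version of the first cleaned title.
    print(obtener_raices(eliminar_stopwords(ejemplo['titulo'][0].lower())))

    # Top TextRank key terms across all titles.
    print(obtener_kpes(ejemplo))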