import re

import nltk
import numpy as np
import pandas as pd
import textacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')

def remove_html_markup(s):
    """Strip HTML tags from a string with a small state machine."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out

def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)

def eliminar_puntuacion(articulo):
    """Remove punctuation and special symbols, character by character."""
    deletion_symbols = set('!()\'-[]{};:"“’”`‘\\/|,<>.?@#$^&*_~+%=¿¡')
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo

def remove_emoji(s):
    """Remove common emoji ranges from a string."""
    regex_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE,
    )
    return regex_pattern.sub(r'', s)

def remover_casos_especiales(s):
    # Remove leading text that ends with ".-", since it is usually a dateline
    # used to open some articles (e.g. "SANTO DOMINGO.-").
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*\.-', '', s)
    return s

def frases_remover(s):
    """Remove boilerplate phrases that news feeds append to article text."""
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)',
                            'se publicó primero en',
                            'Remolacha - Noticias Republica Dominicana',
                            'Read more ›', 'Read more', '[…]', 'RELACIONADAS']
    for l in lista_frases_remover:
        s = s.replace(l, '')
    return s

def eliminar_stopwords(articulo):
    """Drop Spanish stopwords from a whitespace-tokenized article."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo

def obtener_raices(articulo):
    """Reduce every word to its stem with the Spanish Snowball stemmer."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo

def limpieza_articulos(df):
    """Clean the 'titulo' column and return its vocabulary of unique tokens."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_puntuacion)
    # Remove stopwords using the Spanish corpus that nltk ships for this purpose
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_stopwords)
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab

def obtener_kpes(df):
    """Extract the top key terms from the titles with TextRank."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = ' '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_web_sm')
    return textacy.extract.keyterms.textrank(titulos, topn=10)
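
# Minimal usage sketch (illustrative only; the sample titles below are
# hypothetical, not from the real corpus). It assumes a DataFrame with a
# 'titulo' column, as the functions above expect, and that the spaCy model
# has been installed beforehand with:
#   python -m spacy download es_core_web_sm
if __name__ == '__main__':
    articulos = pd.DataFrame({
        'titulo': [
            'El gobierno anuncia <b>nuevas medidas</b> económicas 🙂',
            'SANTO DOMINGO.- La entrada se publicó primero en http://example.com',
        ]
    })
    # Character-level cleanup on each title before the corpus-level steps
    articulos['titulo'] = (articulos['titulo']
                           .apply(remove_html_markup)
                           .apply(remove_URL)
                           .apply(remove_emoji)
                           .apply(remover_casos_especiales)
                           .apply(frases_remover))
    print(limpieza_articulos(articulos))  # vocabulary of the cleaned titles
    print(obtener_kpes(articulos))        # top-10 TextRank key terms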