import re

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import textacy

# Download the NLTK resources needed for tokenization and stopword removal.
nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')


def remove_html_markup(s):
    """Strip HTML tags from a string, keeping only the text outside the tags."""
    tag = False
    quote = False
    out = ""

    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            # Quoted attribute values may contain '<' or '>', so track quotes.
            quote = not quote
        elif not tag:
            out = out + c

    return out

def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)

def eliminar_puntuacion(articulo):
    """Remove punctuation and special symbols, character by character."""
    deletion_symbols = {'!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"', '“', '’', '”', '`', '‘', '``', '\\', '/', '|', ',', '<', '>', '.', '..', '...', '?', '@', '#', '$', '^', '&', '*', '_', '~', '+', '%', '=', '¿', '¡', "''"}
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo

def remove_emoji(s):
    """Remove emoji by matching the main Unicode emoji blocks."""
    regex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', s)

def remover_casos_especiales(s):
    # Remove a leading dateline ending in ".-" (e.g. "SANTO DOMINGO.-"),
    # which is commonly used to open some articles.
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*\.-', '', s)
    return s

def frases_remover(s):
    """Remove boilerplate phrases that news feeds append to article text."""
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)', 'se publicó primero en', 'Remolacha - Noticias Republica Dominicana', 'Read more ›', 'Read more', '[…]', 'RELACIONADAS']
    for l in lista_frases_remover:
        s = s.replace(l, '')
    return s

def eliminar_stopwords(articulo):
    """Remove Spanish stopwords using the NLTK stopword corpus."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo

def obtener_raices(articulo):
    """Reduce each word to its root using the Spanish Snowball stemmer."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo
    
def limpieza_articulos(df):
    """Clean the title column and return the vocabulary of unique tokens."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords using the NLTK Spanish stopword corpus
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab

def obtener_kpes(df):
    """Extract the top key terms from all titles with TextRank (via textacy)."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = ' '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_web_sm')
    return textacy.extract.keyterms.textrank(titulos, topn=10)
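
# --- Usage sketch ----------------------------------------------------------
# A minimal, hypothetical example of chaining the helpers above. It assumes a
# DataFrame with the 'titulo' column the cleaning functions expect, and (for
# obtener_kpes) that the spaCy model es_core_web_sm has been installed with
# `python -m spacy download es_core_web_sm`. The sample titles below are
# illustrative only and not part of the original pipeline.
if __name__ == '__main__':
    df_demo = pd.DataFrame({'titulo': [
        'SANTO DOMINGO.- El gobierno anuncia <b>nuevas</b> medidas http://example.com',
        'Read more › La entrada Noticias de hoy se publicó primero en el portal',
    ]})
    # Apply the per-string cleaners in a plausible order (raw markup first).
    df_demo['titulo'] = (df_demo['titulo']
                         .apply(remove_html_markup)
                         .apply(remove_URL)
                         .apply(remove_emoji)
                         .apply(frases_remover)
                         .apply(remover_casos_especiales))
    # Vocabulary of lowercased, punctuation- and stopword-free title words.
    print(limpieza_articulos(df_demo))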