Update preprocesamiento_articulos.py
Browse files
preprocesamiento_articulos.py
CHANGED
@@ -3,12 +3,15 @@ import numpy as np
|
|
3 |
import nltk
|
4 |
from nltk.tokenize import word_tokenize, RegexpTokenizer
|
5 |
from nltk.corpus import stopwords
|
|
|
|
|
6 |
from sklearn.feature_extraction.text import CountVectorizer
|
7 |
import csv
|
8 |
|
9 |
nltk.download('stopwords')
|
10 |
nltk.download('punkt')
|
11 |
stopwords_es = stopwords.words('spanish')
|
|
|
12 |
|
13 |
def eliminar_puntuacion(articulo):
|
14 |
deletetion_symbols = ['!','(',')',"'",'-','[',']','{','}',';',':','"','“','’','”',"'",'`','‘','``','\\' ,'/','|',',','|','<','>','.','..','...','?','@',"#",'$','^','&','*','_','~','+','%','=','¿','¡',"''"]
|
@@ -27,6 +30,15 @@ def eliminar_stopwords(articulo):
|
|
27 |
new_articulo += " " + x
|
28 |
return new_articulo
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
def limpieza_articulos(df):
|
31 |
|
32 |
df_titulos=pd.DataFrame(df['titulo'], columns=['titulo'])
|
|
|
3 |
import nltk
|
4 |
from nltk.tokenize import word_tokenize, RegexpTokenizer
|
5 |
from nltk.corpus import stopwords
|
6 |
+
from nltk.stem import SnowballStemmer
|
7 |
+
|
8 |
from sklearn.feature_extraction.text import CountVectorizer
|
9 |
import csv
|
10 |
|
11 |
nltk.download('stopwords')
|
12 |
nltk.download('punkt')
|
13 |
stopwords_es = stopwords.words('spanish')
|
14 |
+
spanish_stemmer = SnowballStemmer('spanish')
|
15 |
|
16 |
def eliminar_puntuacion(articulo):
|
17 |
deletetion_symbols = ['!','(',')',"'",'-','[',']','{','}',';',':','"','“','’','”',"'",'`','‘','``','\\' ,'/','|',',','|','<','>','.','..','...','?','@',"#",'$','^','&','*','_','~','+','%','=','¿','¡',"''"]
|
|
|
30 |
new_articulo += " " + x
|
31 |
return new_articulo
|
32 |
|
33 |
+
def obtener_raices(articulo):
    """Return *articulo* with every word replaced by its stem.

    The text is split on whitespace, each word is stemmed with the
    module-level ``spanish_stemmer``, and the stems are concatenated with
    a single space in front of each one.

    Args:
        articulo: Text to process, as a single string.

    Returns:
        A string made of the stemmed words, each preceded by one space
        (so a non-empty result starts with a space). An input containing
        no words yields an empty string.
    """
    # Each stem is prefixed with " " (instead of using " ".join) so the
    # output keeps the exact format of a `result += " " + stem` loop,
    # including the leading space.
    return "".join(
        " " + spanish_stemmer.stem(palabra) for palabra in articulo.split()
    )
|
41 |
+
|
42 |
def limpieza_articulos(df):
|
43 |
|
44 |
df_titulos=pd.DataFrame(df['titulo'], columns=['titulo'])
|