"""Text-cleaning helpers: accent stripping, punctuation removal, stop words."""

import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords

# Acute-accented vowels (both cases) mapped to their unaccented forms.
_ACCENT_TABLE = str.maketrans("áéíóúÁÉÍÓÚ", "aeiouAEIOU")

_COLON_OR_UNDERSCORE_RE = re.compile(r"[:_]")
_GENDERED_SUFFIX_RE = re.compile(r"o/a")
_NON_WORD_NON_SPACE_RE = re.compile(r"[^\w\s]")
_DOTTED_TOKEN_RE = re.compile(r"\w+(?:\.+\w+)*")


def eliminar_acento(s: str) -> str:
    """Return *s* with acute-accented vowels replaced by plain vowels.

    Only ``á é í ó ú`` and their upper-case forms are mapped; every other
    character (including ``ñ`` and ``ü``) is left untouched.
    """
    return s.translate(_ACCENT_TABLE)


@lru_cache(maxsize=1)
def _spanish_stopwords() -> frozenset:
    """Load the NLTK Spanish stop-word list once and cache it.

    The corpus is downloaded only when it is not already available locally,
    instead of calling ``nltk.download`` on every invocation.

    The returned set holds each stop word both as listed and with its acute
    accents stripped, so that text already passed through
    ``eliminar_acento`` still matches entries such as "también" or "más".
    """
    try:
        words = stopwords.words('spanish')
    except LookupError:
        # Corpus not installed yet: fetch it, then retry the load.
        nltk.download('stopwords')
        words = stopwords.words('spanish')
    return frozenset(words) | frozenset(eliminar_acento(w) for w in words)


def eliminar_patrones_stopwords(text: str) -> str:
    """Remove Spanish stop words from *text*.

    The text is split on whitespace, tokens found in the stop-word set are
    dropped, and the remaining tokens are re-joined with single spaces.
    Matching is case-sensitive, so callers should lower-case the text first.
    """
    lstopwords = _spanish_stopwords()
    return ' '.join(word for word in text.split() if word not in lstopwords)


def eliminar_espacios_blancos(texto: str) -> str:
    """Strip punctuation-like characters from *texto*.

    Despite its name this function does not touch whitespace. In order it:

    1. deletes ``:`` and ``_``;
    2. collapses the literal sequence ``o/a`` to ``o``;
    3. deletes every character that is neither a word character nor
       whitespace.
    """
    texto = _COLON_OR_UNDERSCORE_RE.sub('', texto)
    texto = _GENDERED_SUFFIX_RE.sub('o', texto)
    return _NON_WORD_NON_SPACE_RE.sub('', texto)


def clean_text(original: str) -> str:
    """Normalise *original* into lower-case, stop-word-free text.

    Steps, in order:

    1. dots between word characters become spaces; remaining dots are
       deleted;
    2. acute accents are removed from vowels;
    3. punctuation is removed (see ``eliminar_espacios_blancos``);
    4. the text is lower-cased;
    5. Spanish stop words are removed.

    Returns the surviving tokens joined by single spaces.
    """
    # Dots inside a dotted token ("a.b.c") separate words; any other dot is noise.
    texto = _DOTTED_TOKEN_RE.sub(
        lambda match: match.group(0).replace('.', ' '), original
    )
    texto = texto.replace('.', '')
    texto = eliminar_acento(texto)
    texto = eliminar_espacios_blancos(texto)
    texto = texto.lower()
    # The stop-word step splits on whitespace and re-joins with single
    # spaces, so no separate space-collapsing pass is needed.
    return eliminar_patrones_stopwords(texto)