"""Per-language text clean-up helpers (English, German, Spanish)."""

import os
import re
import string
import subprocess

# textblob_de is only needed by clean_german(). Guard the import so that the
# English and Spanish helpers stay usable when the German NLP stack is not
# installed; clean_german() re-raises the failure with context when called.
try:
    from textblob_de import TextBlobDE as TextBlob
except ImportError as _exc:
    TextBlob = None
    _TEXTBLOB_IMPORT_ERROR = _exc
else:
    _TEXTBLOB_IMPORT_ERROR = None

# Deletion table for ASCII punctuation (string.punctuation).
_ASCII_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Same as above plus the Spanish inverted marks (U+00BF, U+00A1), which
# string.punctuation does not contain.
_SPANISH_PUNCT_TABLE = str.maketrans(
    '', '', string.punctuation + '\u00bf\u00a1'
)

# Two or more consecutive spaces.
_SPACE_RUN = re.compile(r' {2,}')

# The pronoun "i" as a whole word: followed by whitespace, an apostrophe
# (contractions such as "i'm") or the end of the text. The leading \b keeps
# words that merely end in "i" (e.g. "hawaii's") untouched.
_LONE_I = re.compile(r"\bi(?=\s|'|$)")


def clean_english(text):
    """Normalize spacing and capitalize the pronoun "I" in English text.

    Runs of spaces are collapsed to a single space, then every stand-alone
    lowercase "i" (including contractions like "i'm") is capitalized.

    Args:
        text: The English text to clean.

    Returns:
        The cleaned text.
    """
    collapsed = _SPACE_RUN.sub(' ', text)
    return _LONE_I.sub('I', collapsed)


def clean_german(text):
    """Strip ASCII punctuation and capitalize common nouns in German text.

    The text is POS-tagged with textblob_de; every token tagged ``NN`` that
    is longer than one character is capitalized wherever it occurs as a
    whole word.

    Args:
        text: The German text to clean.

    Returns:
        The text without ASCII punctuation and with nouns capitalized.

    Raises:
        ImportError: If textblob_de is not installed.
    """
    if TextBlob is None:
        raise ImportError(
            'clean_german() requires the textblob_de package'
        ) from _TEXTBLOB_IMPORT_ERROR

    text = text.translate(_ASCII_PUNCT_TABLE)

    # Map each noun token to its capitalized form.
    nouns = {
        str(word): word.capitalize()
        for word, tag in TextBlob(text).tags
        if tag == 'NN' and len(word) > 1
    }
    if not nouns:
        return text

    # Escape the tokens when building the pattern (not when looking up the
    # match) so the dictionary lookup always uses the raw matched text.
    # Longest-first ordering prevents a short noun from shadowing a longer
    # one; the lookarounds restrict matches to whole words so that a noun
    # is not capitalized in the middle of another word.
    alternatives = '|'.join(
        re.escape(word) for word in sorted(nouns, key=len, reverse=True)
    )
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    return pattern.sub(lambda match: nouns[match.group(0)], text)


def clean_spanish(text):
    """Strip punctuation from Spanish text and normalize spacing.

    Removes ASCII punctuation as well as the inverted question and
    exclamation marks, then collapses the runs of spaces that the removal
    can leave behind.

    Args:
        text: The Spanish text to clean.

    Returns:
        The cleaned text.
    """
    stripped = text.translate(_SPANISH_PUNCT_TABLE)
    return _SPACE_RUN.sub(' ', stripped)