File size: 947 Bytes
0cc2cbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os, re, string
import subprocess
from textblob_de import TextBlobDE as TextBlob


def clean_english(text):
    """Normalize spacing and capitalize the pronoun "I" in English text.

    Runs of two or more spaces are collapsed into a single space. A
    lowercase ``i`` is upper-cased when it starts at a word boundary and is
    followed by whitespace, an apostrophe, or the end of the string, which
    covers "i", "i am", "am i" and contractions such as "i'm" / "i've".

    Args:
        text: The English string to clean.

    Returns:
        The cleaned string.
    """
    # Collapse every run of spaces, not just the first pair of each run.
    clean_text = re.sub(r' {2,}', ' ', text)

    # The leading \b keeps words that merely end in "i" (e.g. "taxi's")
    # untouched; the lookahead leaves the following character as it was
    # instead of overwriting it with a space.
    clean_text = re.sub(r"\bi(?=\s|'|$)", 'I', clean_text)

    return clean_text

def clean_german(text):
    """Strip ASCII punctuation and capitalize the nouns of a German text.

    The text is POS-tagged with ``TextBlob`` (the German ``TextBlobDE``
    imported at the top of this file). Every token tagged ``'NN'`` that is
    longer than one character is replaced by its ``str.capitalize()`` form
    wherever it occurs as a whole word.

    Args:
        text: The German string to clean.

    Returns:
        The text without ``string.punctuation`` characters and with the
        detected nouns capitalized.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tag the German text; each entry of ``tags`` is a (word, tag) pair.
    blob = TextBlob(text)

    # Map each multi-character noun to its capitalized spelling.
    nouns = {
        word: word.capitalize()
        for word, tag in blob.tags
        if tag == 'NN' and len(word) > 1
    }

    if nouns:
        # Escape the keys while building the pattern (not the match during
        # lookup) so the dictionary is always indexed by the raw token.
        # The lookarounds restrict replacement to whole words, so a short
        # noun is not capitalized in the middle of a longer word.
        alternatives = '|'.join(re.escape(word) for word in nouns)
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
        text = pattern.sub(lambda m: nouns[m.group(0)], text)

    return text


def clean_spanish(text):
    """Strip punctuation from Spanish text and collapse repeated spaces.

    Besides the ASCII characters in ``string.punctuation``, the inverted
    question and exclamation marks are removed as well; they are not part
    of ``string.punctuation`` but open Spanish questions and exclamations.

    Args:
        text: The Spanish string to clean.

    Returns:
        The text without punctuation, with every run of two or more spaces
        reduced to a single space.
    """
    # U+00A1 is the inverted exclamation mark, U+00BF the inverted
    # question mark.
    punctuation = string.punctuation + '\u00a1\u00bf'
    clean_text = text.translate(str.maketrans('', '', punctuation))

    # Removing punctuation can leave runs longer than two spaces, so
    # collapse whole runs rather than single pairs.
    clean_text = re.sub(r' {2,}', ' ', clean_text)

    return clean_text