Spaces:
Paused
Paused
File size: 947 Bytes
0cc2cbd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os, re, string
import subprocess
from textblob_de import TextBlobDE as TextBlob
def clean_english(text):
    """Normalise spacing and capitalise the English pronoun "I".

    Runs of two or more spaces are collapsed to a single space, and a
    standalone lowercase "i" is upper-cased when it is followed by
    whitespace, an apostrophe (contractions such as "i'm") or the end of
    the text.

    Args:
        text: English text to clean.

    Returns:
        The cleaned text.
    """
    # Collapse repeated spaces; a one-space -> one-space substitution
    # would be a no-op.
    clean_text = re.sub(r' {2,}', ' ', text)
    # The \b anchor keeps an "i" that merely ends a longer word untouched
    # ("taxi's" must not become "taxI's"); the lookahead leaves the
    # character that follows the pronoun exactly as it was.
    clean_text = re.sub(r"\bi(?=\s|'|$)", 'I', clean_text)
    return clean_text
def clean_german(text):
    """Strip ASCII punctuation and capitalise the nouns of a German text.

    The text is tagged with ``TextBlob`` (``TextBlobDE``); every token
    tagged ``'NN'`` that is longer than one character is replaced by its
    ``str.capitalize()`` form wherever it occurs as a whole word.

    Args:
        text: German text to clean.

    Returns:
        The text without ``string.punctuation`` characters and with the
        detected nouns capitalised.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Part-of-speech tag the text; ``tags`` yields (word, tag) pairs.
    # NOTE(review): 'NN' is presumably the tagger's noun tag -- confirm
    # against the textblob-de tag set.
    blob = TextBlob(text)
    nouns = {
        word: word.capitalize()
        for word, tag in blob.tags
        if tag == 'NN' and len(word) > 1
    }

    if nouns:
        # Escape each noun when building the pattern (not when looking the
        # match up), and anchor on word boundaries so a noun is never
        # capitalised in the middle of a longer word.
        alternatives = '|'.join(re.escape(word) for word in nouns)
        pattern = re.compile(r'\b(?:' + alternatives + r')\b')
        text = pattern.sub(lambda m: nouns[m.group(0)], text)
    return text
def clean_spanish(text):
    """Strip punctuation from a Spanish text and normalise its spacing.

    Removes every character in ``string.punctuation`` plus the Spanish
    inverted marks, then collapses runs of spaces left behind by the
    removed characters.

    Args:
        text: Spanish text to clean.

    Returns:
        The cleaned text.
    """
    # string.punctuation is ASCII-only, so the inverted question and
    # exclamation marks have to be listed explicitly.
    removable = string.punctuation + '¿¡'
    clean_text = text.translate(str.maketrans('', '', removable))
    # Dropping free-standing punctuation ("a - b") leaves double spaces.
    clean_text = re.sub(r' {2,}', ' ', clean_text)
    return clean_text
|