File size: 600 Bytes
4ee3884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import math
import re
import unicodedata

# Any run of Unicode whitespace (tabs, newlines, non-breaking spaces, ...).
_WHITESPACE_RE = re.compile(r'\s+')
# Everything outside the whitelist: space, basic punctuation, ASCII
# letters/digits and precomposed lowercase Vietnamese letters.
_DISALLOWED_RE = re.compile(r'[^ ,.?!a-z0-9àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệđìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵ]')
# Runs of sentence-ending punctuation and runs of commas.
_SENTENCE_END_RE = re.compile(r'[.!?]+')
_COMMA_RE = re.compile(r',+')
# Two or more consecutive spaces.
_SPACE_RUN_RE = re.compile(r' {2,}')


def cleaning(text):
    """Normalize a string to lowercase words with space-separated punctuation.

    Processing steps, in order:

    1. Lowercase the text and normalize it to Unicode NFC, so letters typed
       as base letter + combining marks match the precomposed letters in
       the whitelist instead of losing their diacritics.
    2. Turn every run of whitespace (tab, newline, ...) into one space so
       that words separated by a line break are not glued together.
    3. Delete every character that is not a space, one of ``, . ? !``, an
       ASCII letter or digit, or a lowercase Vietnamese letter.
    4. Replace each run of ``.``, ``!`` and ``?`` with a single `` . `` and
       each run of commas with a single `` , ``.
    5. Collapse consecutive spaces into one.

    The result is not stripped: text that ends with punctuation keeps one
    trailing space, e.g. ``"ok!"`` becomes ``"ok . "``.

    Args:
        text: Value to clean. Only ``str`` instances are processed.

    Returns:
        The cleaned string, or ``text`` itself, unchanged, when it is not
        a ``str`` (for example ``None`` or a number).
    """
    if not isinstance(text, str):
        return text

    # Lowercase first, then compose: 'A' + U+0300 -> 'a' + U+0300 -> 'à'.
    cleaned = unicodedata.normalize('NFC', text.lower())
    # Must run before the whitelist filter, which would otherwise delete
    # tabs/newlines outright and merge the neighbouring words.
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)
    cleaned = _DISALLOWED_RE.sub('', cleaned)
    # '!' and '?' are folded into '.'; padding makes punctuation a token.
    cleaned = _SENTENCE_END_RE.sub(' . ', cleaned)
    cleaned = _COMMA_RE.sub(' , ', cleaned)
    return _SPACE_RUN_RE.sub(' ', cleaned)