Gladiator committed on
Commit
85ebc15
1 Parent(s): cdf5e79

add clean text func

Browse files
Files changed (1) hide show
  1. src/utils.py +29 -0
src/utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Compiled character class matching runs of emoji and pictographic symbols.
#
# Each range below is a dedicated symbol block.  They are listed one by one on
# purpose: a single span from U+24C2 to U+1F251 would also cover CJK
# ideographs, kana and Hangul, so ordinary non-Latin text would be deleted
# along with the emoji.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F1DF"  # enclosed alphanumeric supplement
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)
14
+
15
+
16
# Patterns used by clean_text, compiled once at import time.
_URL_RE = re.compile(r"https?://\S+")  # requires the scheme separator
_MENTION_RE = re.compile(r"@\S+")
_HASHTAG_RE = re.compile(r"#\S+")
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]+")


def clean_text(x: str) -> str:
    """Normalize a piece of text to lowercase ASCII words and digits.

    Steps, in order:

    1. Lowercase the text.
    2. Drop every non-ASCII character (this also removes emoji, so no
       separate emoji pass is needed afterwards).
    3. Replace URLs, ``@mentions`` and ``#hashtags`` with a space.
    4. Delete apostrophes so contractions stay one token ("don't" -> "dont").
    5. Collapse every run of non-alphanumeric characters, whitespace
       included, into a single space.
    6. Strip leading and trailing spaces.

    Args:
        x: The text to clean.

    Returns:
        The cleaned text: lowercase ASCII letters and digits separated by
        single spaces, or an empty string if nothing remains.
    """
    x = x.lower()
    # The ASCII round-trip discards accented letters, emoji and any other
    # non-ASCII code point.
    x = x.encode("ascii", "ignore").decode()
    # Only real URLs are removed; plain words that merely start with "http"
    # (e.g. "httpd") are kept.
    x = _URL_RE.sub(" ", x)
    x = _MENTION_RE.sub(" ", x)
    x = _HASHTAG_RE.sub(" ", x)
    # Apostrophes are deleted (not replaced by a space) before the generic
    # punctuation pass so that contractions are not split in two.
    x = x.replace("'", "")
    # Whitespace is non-alphanumeric too, so this single pass both removes
    # special characters and collapses repeated spaces.
    x = _NON_ALNUM_RE.sub(" ", x)
    return x.strip()