seek007 committed on
Commit
31d5dab
1 Parent(s): 8432f36

Upload 2 files

Browse files
Files changed (2) hide show
  1. TweetNormalizer.py +59 -0
  2. requirements.txt +11 -0
TweetNormalizer.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from emoji import demojize
2
+ from nltk.tokenize import TweetTokenizer
3
+
4
+
5
# Single module-level TweetTokenizer instance, created once at import time and used by normalizeTweet.
+ tokenizer = TweetTokenizer()
6
+
7
+
8
def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Args:
        token: One token string (e.g. an element of the tokenizer output).

    Returns:
        "@USER" for tokens starting with "@"; "HTTPURL" for tokens starting
        with "http" or "www" (case-insensitive); "'" for the typographic
        apostrophe and "..." for the single-character ellipsis; the result
        of ``demojize`` for any other single-character token; otherwise the
        token unchanged.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # These two checks must run before the single-character branch: both
    # characters have length 1, so testing them afterwards was unreachable
    # and they were handed to demojize instead of being replaced.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token
23
+
24
+
25
def normalizeTweet(tweet):
    """Tokenize a raw tweet and return it as one normalized, space-joined string.

    Steps, in order:
      1. Replace the typographic apostrophe and ellipsis with ASCII forms.
      2. Tokenize with the module-level ``tokenizer`` and pass every token
         through ``normalizeToken``.
      3. Apply a fixed, ordered list of substring rewrites for contractions
         ("n't", "'m", "'re", ...) and for "a.m." / "p.m." abbreviations.
      4. Collapse all runs of whitespace to single spaces.

    Args:
        tweet: The raw tweet text.

    Returns:
        The normalized tweet text.
    """
    cleaned = tweet.replace("’", "'").replace("…", "...")

    pieces = []
    for tok in tokenizer.tokenize(cleaned):
        pieces.append(normalizeToken(tok))
    normTweet = " ".join(pieces)

    # Order matters: later rules operate on the output of earlier ones
    # (e.g. "ca n't" only exists after "n't " has been split off).
    rewrites = (
        # Negations.
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # Other clitics.
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # Re-join time abbreviations split into separate tokens.
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in rewrites:
        normTweet = normTweet.replace(old, new)

    # split() with no argument drops leading/trailing whitespace and
    # collapses any double spaces introduced by the rewrites above.
    return " ".join(normTweet.split())
52
+
53
+
54
if __name__ == "__main__":
    # Demo when run as a script: normalize one sample tweet (it contains a
    # URL, a trailing ellipsis character and an @-mention) and print it.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    print(normalizeTweet(sample_tweet))
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ joblib
2
+ transformers
3
+ matplotlib
4
+ pandas
5
+ emoji
6
+ nltk
7
+ seaborn
8
+ numpy
9
+ torch
10
+ tensorflow
11
+ tf-keras