mginoben committed on
Commit
bf5fae7
1 Parent(s): 4e90ce5

Profanity with hashtag detection

Browse files
Files changed (2) hide show
  1. app.py +64 -40
  2. contractions.json +4 -1
app.py CHANGED
@@ -7,8 +7,7 @@ from thefuzz import process, fuzz
7
  import numpy as np
8
  import re
9
  import nltk
10
- nltk.download('words')
11
- from nltk.corpus import words
12
 
13
 
14
  API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
@@ -36,58 +35,86 @@ def read_text(filename, filetype='txt'):
36
  contractions = read_text('contractions', 'json')
37
  similar_words = read_text('similar_words')
38
  addon_words = read_text('addon_words')
39
- profanities = read_text('profanities', 'json')
40
- lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
41
- lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
42
- eng_words = list(set(words.words()) - set(lookup_profanity))
43
  punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
44
 
45
- # TODO check eng words that are tagalog profanities
46
-
47
  def fuzzy_lookup(tweet):
48
 
49
  matched_profanity = dict()
50
 
51
- # tweet = punctuations.sub('', tweet).lower()
52
-
53
  for word in tweet.split():
54
 
 
 
 
 
 
 
 
 
55
  word = punctuations.sub('', word).lower()
 
 
56
  base_word = word
 
 
57
  word = re.sub(r'(.)\1{2,}', r'\1', word)
58
-
59
- if word in eng_words:
60
- continue
61
-
 
 
62
  for addon in addon_words:
63
  if word.startswith(addon):
64
  word = word[len(addon):]
65
  if word.endswith(addon):
66
  word = word[:-len(addon)]
67
 
68
- if word.startswith("@") or word.startswith("#"):
69
- word = word[1:]
70
 
71
- scores = []
72
- matched_words = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
- if len(word) >= 4:
75
- # Get fuzzy ratio
76
- for lookup_word in lookup_words:
77
- score = fuzz.ratio(word, lookup_word)
78
- if score >= 70:
79
- scores.append(score)
80
- matched_words.append(lookup_word)
81
- if len(scores) > 0:
82
- max_score_index = np.argmax(scores)
83
- if matched_words[max_score_index] in lookup_profanity:
84
- for base_profanity, profanity_variations in profanities.items():
85
- if matched_words[max_score_index] == base_profanity:
86
- matched_profanity[base_word] = base_profanity
87
- break
88
- if matched_words[max_score_index] in profanity_variations:
89
- matched_profanity[base_word] = base_profanity
90
- break
91
 
92
  return matched_profanity
93
 
@@ -108,11 +135,6 @@ def preprocess(tweet, profanities):
108
 
109
  for index, word in enumerate(row_split):
110
 
111
- # Separate pronouns
112
- for addon in addon_words:
113
- if word.endswith(addon):
114
- row_split[index] = word[:-len(addon)] + " " + addon
115
-
116
  # Remove links
117
  if 'http' in word:
118
  row_split[index] = ''
@@ -150,9 +172,11 @@ def predict(tweet):
150
  print(prediction)
151
  error_message = prediction['error']
152
  return error_message, {}
 
153
  prediction = prediction[0][0]["label"]
154
 
155
  print("\nTWEET:", tweet)
 
156
  print("DETECTED PROFANITY:", list(profanities.keys()))
157
  print("LABEL:", prediction, "\n")
158
 
 
7
  import numpy as np
8
  import re
9
  import nltk
10
+ from english_words import get_english_words_set
 
11
 
12
 
13
  API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
 
35
  contractions = read_text('contractions', 'json')
36
  similar_words = read_text('similar_words')
37
  addon_words = read_text('addon_words')
38
+ profanities_dict = read_text('profanities', 'json')
39
+ lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
40
+ lookup_words = list(set(similar_words).union(set(lookup_profanity)))
41
+ eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
42
  punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
43
 
 
 
44
  def fuzzy_lookup(tweet):
45
 
46
  matched_profanity = dict()
47
 
 
 
48
  for word in tweet.split():
49
 
50
+ if word in eng_words:
51
+ continue
52
+
53
+ scores = []
54
+ matched_words = []
55
+ matched_word = None
56
+
57
+ # Remove leading/trailing punctuation except # and @
58
  word = punctuations.sub('', word).lower()
59
+
60
+ # Save base word
61
  base_word = word
62
+
63
+ # Shorten elongated word
64
  word = re.sub(r'(.)\1{2,}', r'\1', word)
65
+
66
+ # Remove # and @
67
+ if word.startswith("#") or word.startswith("@"):
68
+ word = word[1:]
69
+
70
+ # Remove leading/trailing addon words (mo, ka, pinaka)
71
  for addon in addon_words:
72
  if word.startswith(addon):
73
  word = word[len(addon):]
74
  if word.endswith(addon):
75
  word = word[:-len(addon)]
76
 
77
+ if len(word) < 4:
78
+ continue
79
 
80
+ # Get fuzzy ratio
81
+ for lookup_word in lookup_words:
82
+
83
+ score = fuzz.ratio(word, lookup_word)
84
+
85
+ # Threshold
86
+ if score >= 70:
87
+ scores.append(score)
88
+ matched_words.append(lookup_word)
89
+
90
+ if len(scores) == 0:
91
+ continue
92
+
93
+ if len(set(scores)) == 1:
94
+ for matched_word in matched_words:
95
+ if matched_word in lookup_profanity:
96
+ matched_word = matched_word
97
+ break
98
+ else:
99
+ # Get matched word with max score
100
+ max_score_index = np.argmax(scores)
101
+ matched_word = matched_words[max_score_index]
102
+
103
+ if matched_word not in lookup_profanity:
104
+ continue
105
+
106
+ for base_profanity, profanity_variations in profanities_dict.items():
107
+
108
+ if matched_word in profanity_variations or matched_word == base_profanity:
109
+
110
+ # Separate pronouns
111
+ for addon in addon_words:
112
+ if base_word.endswith(addon):
113
+ base_profanity = base_profanity + " " + addon
114
+ break
115
 
116
+ matched_profanity[base_word] = base_profanity
117
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  return matched_profanity
120
 
 
135
 
136
  for index, word in enumerate(row_split):
137
 
 
 
 
 
 
138
  # Remove links
139
  if 'http' in word:
140
  row_split[index] = ''
 
172
  print(prediction)
173
  error_message = prediction['error']
174
  return error_message, {}
175
+
176
  prediction = prediction[0][0]["label"]
177
 
178
  print("\nTWEET:", tweet)
179
+ print("PROCESSED TWEET:", preprocessed_tweet)
180
  print("DETECTED PROFANITY:", list(profanities.keys()))
181
  print("LABEL:", prediction, "\n")
182
 
contractions.json CHANGED
@@ -29,5 +29,8 @@
29
  "kelan": "kailan",
30
  "raw": "daw",
31
  "itong": "ito ang",
32
- "lng": "lang"
 
 
 
33
  }
 
29
  "kelan": "kailan",
30
  "raw": "daw",
31
  "itong": "ito ang",
32
+ "lng": "lang",
33
+ "putang ina": "putangina",
34
+ "tangina" : "tangina",
35
+ "inamo" : "ina mo"
36
  }