mginoben committed on
Commit
fa21182
1 Parent(s): 8adc428

Modified app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -13
app.py CHANGED
@@ -6,7 +6,6 @@ import json
6
  from thefuzz import process, fuzz
7
  import numpy as np
8
  import re
9
- from string import punctuation
10
  import nltk
11
  nltk.download('words')
12
  from nltk.corpus import words
@@ -47,7 +46,7 @@ punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
47
 
48
  def fuzzy_lookup(tweet):
49
 
50
- matched_profanity = []
51
 
52
  # tweet = punctuations.sub('', tweet).lower()
53
 
@@ -55,6 +54,7 @@ def fuzzy_lookup(tweet):
55
 
56
  word = punctuations.sub('', word).lower()
57
  base_word = word
 
58
 
59
  if word in eng_words:
60
  continue
@@ -71,8 +71,6 @@ def fuzzy_lookup(tweet):
71
  scores = []
72
  matched_words = []
73
 
74
- print(word)
75
-
76
  if len(word) >= 4:
77
  # Get fuzzy ratio
78
  for lookup_word in lookup_words:
@@ -83,16 +81,26 @@ def fuzzy_lookup(tweet):
83
  if len(scores) > 0:
84
  max_score_index = np.argmax(scores)
85
  if matched_words[max_score_index] in lookup_profanity:
86
- matched_profanity.append(base_word)
87
-
 
 
 
 
 
 
88
  return matched_profanity
89
 
90
 
91
- def preprocess(tweet):
92
 
93
  tweet = tweet.lower()
94
  tweet = emoji.replace_emoji(tweet, replace='')
95
 
 
 
 
 
96
  # Elongated words conversion
97
  tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
98
 
@@ -125,10 +133,11 @@ def preprocess(tweet):
125
 
126
  def predict(tweet):
127
 
128
- preprocessed_tweet = preprocess(tweet)
129
- matched_profanity = fuzzy_lookup(preprocessed_tweet)
130
 
131
- if len(matched_profanity) > 0:
 
 
132
 
133
  prediction = query(preprocessed_tweet)
134
 
@@ -139,10 +148,10 @@ def predict(tweet):
139
  prediction = prediction[0][0]["label"]
140
 
141
  print("\nTWEET:", tweet)
142
- print("DETECTED PROFANITY:", matched_profanity)
143
  print("LABEL:", prediction, "\n")
144
 
145
- return prediction, matched_profanity
146
 
147
  return "No Profanity", {}
148
 
@@ -165,4 +174,7 @@ demo = gr.Interface(
165
  title="Tagalog Profanity Classifier"
166
  )
167
 
168
- demo.launch(debug=True)
 
 
 
 
6
  from thefuzz import process, fuzz
7
  import numpy as np
8
  import re
 
9
  import nltk
10
  nltk.download('words')
11
  from nltk.corpus import words
 
46
 
47
  def fuzzy_lookup(tweet):
48
 
49
+ matched_profanity = dict()
50
 
51
  # tweet = punctuations.sub('', tweet).lower()
52
 
 
54
 
55
  word = punctuations.sub('', word).lower()
56
  base_word = word
57
+ word = re.sub(r'(.)\1{2,}', r'\1', word)
58
 
59
  if word in eng_words:
60
  continue
 
71
  scores = []
72
  matched_words = []
73
 
 
 
74
  if len(word) >= 4:
75
  # Get fuzzy ratio
76
  for lookup_word in lookup_words:
 
81
  if len(scores) > 0:
82
  max_score_index = np.argmax(scores)
83
  if matched_words[max_score_index] in lookup_profanity:
84
+ for base_profanity, profanity_variations in profanities.items():
85
+ if matched_words[max_score_index] == base_profanity:
86
+ matched_profanity[base_word] = base_profanity
87
+ break
88
+ if matched_words[max_score_index] in profanity_variations:
89
+ matched_profanity[base_word] = base_profanity
90
+ break
91
+
92
  return matched_profanity
93
 
94
 
95
+ def preprocess(tweet, profanities):
96
 
97
  tweet = tweet.lower()
98
  tweet = emoji.replace_emoji(tweet, replace='')
99
 
100
+ # Replace profanities
101
+ for base_word, matched_word in profanities.items():
102
+ tweet = tweet.replace(base_word, matched_word)
103
+
104
  # Elongated words conversion
105
  tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
106
 
 
133
 
134
  def predict(tweet):
135
 
136
+ profanities = fuzzy_lookup(tweet)
 
137
 
138
+ if len(profanities) > 0:
139
+
140
+ preprocessed_tweet = preprocess(tweet, profanities)
141
 
142
  prediction = query(preprocessed_tweet)
143
 
 
148
  prediction = prediction[0][0]["label"]
149
 
150
  print("\nTWEET:", tweet)
151
+ print("DETECTED PROFANITY:", list(profanities.keys()))
152
  print("LABEL:", prediction, "\n")
153
 
154
+ return prediction, list(profanities.keys())
155
 
156
  return "No Profanity", {}
157
 
 
174
  title="Tagalog Profanity Classifier"
175
  )
176
 
177
+ # demo.launch(debug=True)
178
+ tweet = "Tangaaa pala eh mamatay ka na pakyuuuu gag000 ul0l bob0 t4nginamo"
179
+
180
+ predict(tweet)