mginoben committed on
Commit
6912dca
1 Parent(s): 59f8d0b

Fixed disappearing profanities

Browse files
Files changed (2) hide show
  1. app.py +9 -17
  2. lookup_words.txt +1 -0
app.py CHANGED
@@ -31,9 +31,8 @@ obj_pronouns = read_text('obj_pronouns')
31
  profanities = read_text('profanities', 'json')
32
 
33
 
34
- def fuzzyLookup(tweet):
35
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
36
- obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
37
  matches = dict()
38
 
39
  # Loop each word in tweet
@@ -58,7 +57,7 @@ def fuzzyLookup(tweet):
58
 
59
  for word, matched_profanity in matches.items():
60
  word_split = word.split(matched_profanity[-2:])
61
- for pronoun in obj_pronoun:
62
  if len(word_split) > 1:
63
  if pronoun == word_split[-1]:
64
  matches[word] = matched_profanity + ' ' + pronoun
@@ -68,13 +67,12 @@ def fuzzyLookup(tweet):
68
  for word, matched_profanity in matches.items():
69
  tweet = tweet.replace(word, matched_profanity)
70
 
71
- tweet_split = tweet.split()
72
  for profanity, prof_varations in profanities.items():
73
- for i, word in enumerate(tweet_split):
74
- if word in prof_varations:
75
- tweet_split[i] = profanity
76
- tweet = ' '.join(tweet_split)
77
 
 
78
  return tweet, matches
79
 
80
 
@@ -108,10 +106,6 @@ def preprocess(tweet):
108
  if any(x in word for x in laugh_texts):
109
  row_split[index] = 'haha'
110
 
111
- # Remove words with digits (4ever)
112
- if any(x.isdigit() for x in word):
113
- row_split[index] = ''
114
-
115
  # Combine list of words back to sentence
116
  combined_text = ' '.join(filter(None, row_split))
117
 
@@ -136,9 +130,8 @@ def query(payload):
136
 
137
  def predict(tweet):
138
 
139
- fuzz_text, matches = fuzzyLookup(tweet)
140
- processed_text = preprocess(fuzz_text)
141
- output = query(processed_text)
142
 
143
  if 'error' in output:
144
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
@@ -149,14 +142,13 @@ def predict(tweet):
149
 
150
  if predicted_label == 'Abusive':
151
  for base_word, _ in matches.items():
 
152
  tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
153
 
154
  return output, tweet, json.dumps(matches)
155
  else:
156
  return output, tweet, json.dumps(matches)
157
 
158
-
159
-
160
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
161
 
162
 
 
31
  profanities = read_text('profanities', 'json')
32
 
33
 
34
+ def fuzzy_lookup(tweet):
35
  lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
 
36
  matches = dict()
37
 
38
  # Loop each word in tweet
 
57
 
58
  for word, matched_profanity in matches.items():
59
  word_split = word.split(matched_profanity[-2:])
60
+ for pronoun in obj_pronouns:
61
  if len(word_split) > 1:
62
  if pronoun == word_split[-1]:
63
  matches[word] = matched_profanity + ' ' + pronoun
 
67
  for word, matched_profanity in matches.items():
68
  tweet = tweet.replace(word, matched_profanity)
69
 
 
70
  for profanity, prof_varations in profanities.items():
71
+ if len(prof_varations) > 0:
72
+ for prof_variant in prof_varations:
73
+ tweet = tweet.replace(prof_variant, profanity)
 
74
 
75
+ print('Fuzzy Returns:', tweet)
76
  return tweet, matches
77
 
78
 
 
106
  if any(x in word for x in laugh_texts):
107
  row_split[index] = 'haha'
108
 
 
 
 
 
109
  # Combine list of words back to sentence
110
  combined_text = ' '.join(filter(None, row_split))
111
 
 
130
 
131
  def predict(tweet):
132
 
133
+ fuzzy_text, matches = fuzzy_lookup(tweet)
134
+ output = query(preprocess(fuzzy_text))
 
135
 
136
  if 'error' in output:
137
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
 
142
 
143
  if predicted_label == 'Abusive':
144
  for base_word, _ in matches.items():
145
+
146
  tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
147
 
148
  return output, tweet, json.dumps(matches)
149
  else:
150
  return output, tweet, json.dumps(matches)
151
 
 
 
152
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
153
 
154
 
lookup_words.txt CHANGED
@@ -152,4 +152,5 @@ kang
152
  bubuka
153
  buka
154
  talaga
 
155
  g@g0
 
152
  bubuka
153
  buka
154
  talaga
155
+ tuloy
156
  g@g0