mginoben committed on
Commit
f108b87
•
1 Parent(s): 30b21ca

Modified reverted changes

obj_pronouns.txt → addon_words.txt RENAMED
@@ -7,4 +7,8 @@ ninyo
 nila
 ka
 nyo
-ng
+ng
+an
+am
+napaka
+paka
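Besides the rename, the list gains an, am, napaka and paka on top of the old object pronouns; app.py now reads this file as addon_words and checks each entry as a possible prefix or suffix of a candidate word inside fuzzy_lookup (see the hunk and the note after it below).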
app.py CHANGED
@@ -16,8 +16,8 @@ API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-pro
 headers = {"Authorization": "Bearer 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"}
 
 def query(text):
-    text = {"inputs": text}
-    response = requests.post(API_URL, headers=headers, json=text)
+    payload = {"inputs": text}
+    response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()
 
 def read_text(filename, filetype='txt'):
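The only change in query() is that the request body gets its own name instead of rebinding the text argument; behavior is identical. A minimal standalone sketch of the call, with placeholder URL and token standing in for the values defined in app.py:

```python
# Sketch only: <model-id> and <HF_API_TOKEN> are placeholders, not the repo's values.
import requests

API_URL = "https://api-inference.huggingface.co/models/<model-id>"
headers = {"Authorization": "Bearer <HF_API_TOKEN>"}

def query(text):
    payload = {"inputs": text}  # wrap the raw string in the Inference API's expected body
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

# On success the endpoint typically returns a nested list of {"label": ..., "score": ...}
# dicts; while the model is loading it returns {"error": ...}. predict() below handles both.
```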
@@ -35,21 +35,32 @@ def read_text(filename, filetype='txt'):
 
 
 contractions = read_text('contractions', 'json')
-lookup_words = read_text('lookup_words')
-obj_pronouns = read_text('obj_pronouns')
+similar_words = read_text('similar_words')
+addon_words = read_text('addon_words')
 profanities = read_text('profanities', 'json')
 lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
+lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
 eng_words = list(set(words.words()) - set(lookup_profanity))
 
 # TODO check eng words that are tagalog profanities
 
 def fuzzy_lookup(tweet):
 
-    matched_profanity = dict()
+    matched_profanity = []
 
     for word in tweet.split():
+
+        base_word = word
+
         if word in eng_words:
             continue
+
+        for addon in addon_words:
+            if word.startswith(addon):
+                word[len(addon):]
+            if word.endswith(addon):
+                word[:-len(addon)]
+
         scores = []
         matched_words = []
         word = word.strip(punctuation)
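One detail worth flagging in the new add-on handling: the slice expressions word[len(addon):] and word[:-len(addon)] are evaluated but never assigned back, so as committed the loop does not actually change word before matching. A hedged sketch of the presumable intent, using a hypothetical strip_addons helper and only the subset of addon_words.txt visible in this commit:

```python
# Hypothetical helper (not in app.py) showing the affix stripping the loop appears to aim for.
addon_words = ["nila", "ka", "nyo", "ng", "an", "am", "napaka", "paka"]  # subset of addon_words.txt

def strip_addons(word):
    for addon in addon_words:
        if word.startswith(addon):
            word = word[len(addon):]   # the assignment is what the committed loop omits
        if word.endswith(addon):
            word = word[:-len(addon)]
    return word

print(strip_addons("napakabobo"))  # -> "bobo"
```

Note that fuzzy_lookup keeps the original spelling in base_word, so stripping would only affect the matching step, not the word that gets reported.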
@@ -65,27 +76,9 @@ def fuzzy_lookup(tweet):
         if len(scores) > 0:
             max_score_index = np.argmax(scores)
             if matched_words[max_score_index] in lookup_profanity:
-                matched_profanity[word] = matched_words[max_score_index]
+                matched_profanity.append(base_word)
 
-    # Expand Pronouns in Profanities
-    for word, profanity in matched_profanity.items():
-        word_split = word.split(profanity[-2:])
-        for pronoun in obj_pronouns:
-            if len(word_split) > 1:
-                if pronoun == word_split[-1]:
-                    matched_profanity[word] = profanity + ' ' + pronoun
-                    break
-
-    # Replace each profanities by fuzzy lookup result
-    for word, profanity in matched_profanity.items():
-        tweet = tweet.replace(word, profanity)
-
-    for profanity, prof_varations in profanities.items():
-        if len(prof_varations) > 0:
-            for prof_variant in prof_varations:
-                tweet = tweet.replace(prof_variant, profanity)
-
-    return tweet, matched_profanity
+    return matched_profanity
 
 
 def preprocess(tweet):
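Net effect of this hunk: fuzzy_lookup no longer rewrites the tweet or expands object pronouns; it simply collects the original spelling (base_word) of every word whose best fuzzy match is a known profanity and returns that list, leaving the text that reaches the model untouched.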
@@ -112,44 +105,40 @@ def preprocess(tweet):
     # Combine list of words back to sentence
     preprocessed_tweet = ' '.join(filter(None, row_split))
 
-    # Fuzzy Lookup
-    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)
-
     if len(preprocessed_tweet.split()) == 1:
-        return preprocessed_tweet, matches
+        return preprocessed_tweet
 
     # Expand Contractions
     for i in contractions.items():
         preprocessed_tweet = re.sub(rf"\b{i[0]}\b", i[1], preprocessed_tweet)
 
-    return preprocessed_tweet, matches
+    return preprocessed_tweet
 
 
 
 def predict(tweet):
 
-    preprocessed_tweet, matched_profanity = preprocess(tweet)
+    preprocessed_tweet = preprocess(tweet)
+    matched_profanity = fuzzy_lookup(preprocessed_tweet)
 
-    prediction = query(preprocessed_tweet)
+    if len(matched_profanity) > 0:
 
-    if type(prediction) == dict:
-        print(prediction)
-        error_message = prediction['error']
-        return error_message
+        prediction = query(preprocessed_tweet)
 
-
-    if bool(matched_profanity) == False:
-        return "No Profanity"
+        if type(prediction) == dict:
+            print(prediction)
+            error_message = prediction['error']
+            return error_message, [[]]
 
-
-    prediction = [tuple(i.values()) for i in prediction[0]]
-    prediction = dict((x, y) for x, y in prediction)
-
-    print("\nTWEET:", tweet)
-    print("DETECTED PROFANITY:", matched_profanity)
-    print("LABELS:", prediction, "\n")
+        prediction = prediction[0][0]["label"]
+
+        print("\nTWEET:", tweet)
+        print("DETECTED PROFANITY:", matched_profanity)
+        print("LABEL:", prediction, "\n")
 
-    return prediction
+        return prediction, [matched_profanity]
+
+    return "No Profanity", [[]]
 
 
 demo = gr.Interface(
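For reference, a small sketch of how the reshaped predict() consumes the Inference API response, assuming the usual text-classification shape (a list containing one ranked list of label/score dicts). parse_prediction is a hypothetical stand-in for the branching inside predict(), and the label names are illustrative only:

```python
# Hypothetical helper mirroring predict()'s branching: surface an API error dict as-is,
# otherwise return the top-ranked label together with the matched profanity list.
def parse_prediction(prediction, matched_profanity):
    if isinstance(prediction, dict):
        return prediction["error"], [[]]
    return prediction[0][0]["label"], [matched_profanity]

print(parse_prediction([[{"label": "LABEL_1", "score": 0.99}]], ["gag0"]))
# -> ('LABEL_1', [['gag0']])
print(parse_prediction({"error": "model is loading"}, []))
# -> ('model is loading', [[]])
```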
@@ -157,13 +146,18 @@ demo = gr.Interface(
 
     inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
 
-    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
+    outputs=[gr.components.Text(label="PREDICTION"), gr.List(label="PROFANITIES")],
 
     examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
+
+    allow_flagging="never",
+
+    title="Tagalog Profanity Classifier"
 )
 
-demo.launch(debug=True)
+demo.launch(debug=True)
+predict("Tangina mo naman gag0 ka ba")
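Since predict() now returns a pair, the interface declares two output components to match: the gr.components.Text field receives the label (or the API error / "No Profanity" string) and the gr.List receives the detected words. Both return paths wrap the word list one level deeper ([matched_profanity] and [[]]) so the shape handed to gr.List stays consistent.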
profanities.json CHANGED
@@ -1,16 +1,16 @@
 {
-    "bobo": ["boboka", "b0b0"],
-    "bwiset": ["buwesit", "buwiset"],
-    "gago": ["gaga", "g@g0", "ginago"],
+    "bobo": ["bobobo", "b0b0"],
+    "bwiset": ["buwesit", "buwiset", "bwisit"],
+    "gago": ["gaga", "g@g0"],
     "kupal": [],
-    "pakshet": [],
+    "pakshet": ["pakshit"],
     "pakyu": [],
     "pucha": [],
     "punyeta": [],
     "puta": ["pota"],
     "putangina": ["pukingina", "kinangina", "putang"],
     "tanga": [],
-    "tangina": ["tangna", "inamo"],
-    "tarantado": ["tinarantado", "t@r@nt@d0"],
+    "tangina": ["tangna", "inamo", "tatanga"],
+    "tarantado": ["t4r4nt4do", "t@r@nt@do"],
     "ulol": ["ul0l", "olol", "0lol"]
 }
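For context, app.py flattens this mapping into the lookup_profanity array (the np.concatenate/np.hstack line in the app.py diff above), so both the canonical spellings and every listed variant take part in the fuzzy lookup. A small sketch of that flattening, assuming profanities.json sits next to the script:

```python
# Sketch of the flattening done in app.py: variant spellings (values) and canonical
# forms (keys) are merged into a single lookup array.
import json
import numpy as np

with open("profanities.json", encoding="utf-8") as f:
    profanities = json.load(f)

lookup_profanity = np.concatenate(
    [np.hstack(list(profanities.values())), list(profanities.keys())]
)

print(len(lookup_profanity))          # variants + canonical entries
print("bwisit" in lookup_profanity)   # True with this commit's additions
```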
lookup_words.txt → similar_words.txt RENAMED
File without changes