mginoben committed
Commit 7a70c71
1 Parent(s): 21c119c

Reprogrammed app.

Files changed (1)
  1. app.py +84 -65
app.py CHANGED
@@ -9,8 +9,8 @@ import re
 from string import punctuation
 
 
-API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
-headers = {"Authorization": "Bearer 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"}
+API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
+headers = {"Authorization": "Bearer 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"}
 
 def read_text(filename, filetype='txt'):
     words = []
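The hunk above swaps in the new model endpoint but still ships a hardcoded bearer token. A minimal sketch of the usual alternative, assuming the token is exported in an environment variable named HF_API_TOKEN (a name chosen here, not taken from the commit):

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# Read the token from the environment so it never lands in version control.
headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}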
@@ -31,20 +31,34 @@ lookup_words = read_text('lookup_words')
 obj_pronouns = read_text('obj_pronouns')
 profanities = read_text('profanities', 'json')
 
+def query(text):
+    text = {"inputs": text}
+    response = requests.post(API_URL, headers=headers, json=text)
+    return response.json()
+
+
+# for profanity in profanities:
+#     print(profanity, process.extractOne(profanity, tweet.split(), scorer=fuzz.ratio))
+
 
 def fuzzy_lookup(tweet):
+
+    matched_profanity = dict()
+
+    # Convert Profanity Dict to List
     lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
-    matches = dict()
 
     # Loop each word in tweet
     for word in tweet.split():
+        scores = []
+        matched_words = []
+
         # Remove punctuations
         word = word.strip(punctuation)
 
         # Only get digits and letters then lowercase
-        processed_word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
-        scores = []
-        matched_words = []
+        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)
+
         # If word has at least 4 chars
         if len(processed_word) >= 4:
             # Get fuzzy ratio
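The new per-word pass resets scores and matched_words on every iteration, then compares each processed word against the profanity lookup with a fuzzy ratio. A small sketch of that comparison, assuming the fuzzywuzzy package that the commented-out process.extractOne line references:

from fuzzywuzzy import fuzz

# Obfuscated spellings still score close to their canonical form,
# which is what lets the lookup catch variants like 'ul0l' for 'ulol'.
print(fuzz.ratio("ulol", "ul0l"))  # high ratio despite the digit swap
print(fuzz.ratio("ulol", "haha"))  # an unrelated word scores near zero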
@@ -56,33 +70,30 @@ def fuzzy_lookup(tweet):
         if len(scores) > 0:
             max_score_index = np.argmax(scores)
             if matched_words[max_score_index] in lookup_profanity:
-                matches[word] = matched_words[max_score_index]
-
+                matched_profanity[word] = matched_words[max_score_index]
 
-    for word, matched_profanity in matches.items():
-        word_split = word.split(matched_profanity[-2:])
+    for word, profanity in matched_profanity.items():
+        word_split = word.split(profanity[-2:])
         for pronoun in obj_pronouns:
             if len(word_split) > 1:
                 if pronoun == word_split[-1]:
-                    matches[word] = matched_profanity + ' ' + pronoun
+                    matched_profanity[word] = profanity + ' ' + pronoun
                     break
 
     # Replace each profanities by fuzzy lookup result
-    for word, matched_profanity in matches.items():
-        tweet = tweet.replace(word, matched_profanity)
+    for word, profanity in matched_profanity.items():
+        tweet = tweet.replace(word, profanity)
 
     for profanity, prof_varations in profanities.items():
         if len(prof_varations) > 0:
             for prof_variant in prof_varations:
                 tweet = tweet.replace(prof_variant, profanity)
 
-    return tweet, matches
+    return tweet, matched_profanity
 
 
 def preprocess(tweet):
-    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
-    symbols = ['@', '#']
-
     # Lowercase
     tweet = tweet.lower()
 
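The reworked pronoun pass splits each original word on the last two characters of its matched profanity to check whether an object pronoun is fused onto the end. A toy illustration of that split, with hypothetical words standing in for the app's obj_pronouns and profanities dictionaries:

# Hypothetical data; the real lists come from obj_pronouns and profanities.
profanity = "tangina"
word = "tanginamo"                       # pronoun 'mo' fused onto the profanity
word_split = word.split(profanity[-2:])  # split on 'na' -> ['tangi', 'mo']
if len(word_split) > 1 and word_split[-1] == "mo":
    print(profanity + " " + "mo")        # -> 'tangina mo'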
 
@@ -97,71 +108,85 @@ def preprocess(tweet):
 
     for index, word in enumerate(row_split):
 
-        # Remove words with symbols (e.g. @username, #hashtags)
-        if any(x in word for x in symbols):
-            row_split[index] = ''
-
         # Remove links
         if 'http' in word:
             row_split[index] = ''
 
         # Unify laugh texts format to 'haha'
+        laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
         if any(x in word for x in laugh_texts):
             row_split[index] = 'haha'
 
     # Combine list of words back to sentence
-    combined_text = ' '.join(filter(None, row_split))
+    preprocessed_tweet = ' '.join(filter(None, row_split))
 
     # Check if output contains single word then return null
-    if len(combined_text.split()) == 1:
-        return combined_text
-
-    # Filter needed characters
-    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)
+    if len(preprocessed_tweet.split()) == 1:
+        return preprocessed_tweet, {}
 
     # Expand Contractions
     for i in contractions.items():
-        combined_text = re.sub(rf"\b{i[0]}\b", i[1], combined_text)
+        preprocessed_tweet = re.sub(rf"\b{i[0]}\b", i[1], preprocessed_tweet)
 
-    return combined_text
+    # Fuzzy Lookup
+    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)
 
-
-def query(payload):
-    response = requests.post(API_URL, headers=headers, json=payload)
-    return response.json()
+    return preprocessed_tweet, matches
 
 
 def predict(tweet):
 
-    fuzzy_text, matches = fuzzy_lookup(tweet)
-    processed_text = preprocess(fuzzy_text)
-    output = query(processed_text)
+    preprocessed_tweet, matched_profanity = preprocess(tweet)
+
+    prediction = query(preprocessed_tweet)
+
+    if type(prediction) is dict:
+        return "Model is still loading. Try again."
 
-    if 'error' in output:
-        return output['error'], 'Error occured. Try again later.', {}
-    elif len(matches) == 0:
-        return 'No Profanity Found.', '', {}
-    else:
-        output = [tuple(i.values()) for i in output[0]]
-        output = dict((x, y) for x, y in output)
-        predicted_label = list(output.keys())[0]
+    if not matched_profanity:
+        return "No profanity found."
+
+    prediction = [tuple(i.values()) for i in prediction[0]]
+    prediction = dict((x, y) for x, y in prediction)
+
+    print("\n", tweet)
+    print(matched_profanity)
+    print(prediction, "\n")
 
-        if predicted_label == 'Abusive':
-            # Censor
-            for base_word, _ in matches.items():
-                mask = '*' * len(base_word)
-                compiled = re.compile(re.escape(base_word), re.IGNORECASE)
-                tweet = compiled.sub(mask, tweet)
-            # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
-            return output, tweet, json.dumps(matches)
-        else:
-            return output, tweet, json.dumps(matches)
+    return prediction
 
-# output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
-# print(output, '\n', tweet, '\n', matches)
-
-hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
+# # def predict(tweet):
+
+# #     fuzzy_text, matches = fuzzy_lookup(tweet)
+# #     processed_text = preprocess(fuzzy_text)
+# #     output = query(processed_text)
+
+# #     if 'error' in output:
+# #         return output['error'], 'Error occured. Try again later.', {}
+# #     elif len(matches) == 0:
+# #         return 'No Profanity Found.', '', {}
+# #     else:
+# #         output = [tuple(i.values()) for i in output[0]]
+# #         output = dict((x, y) for x, y in output)
+# #         predicted_label = list(output.keys())[0]
+
+# #         if predicted_label == 'Abusive':
+# #             # Censor
+# #             for base_word, _ in matches.items():
+# #                 mask = '*' * len(base_word)
+# #                 compiled = re.compile(re.escape(base_word), re.IGNORECASE)
+# #                 tweet = compiled.sub(mask, tweet)
+# #                 # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
+# #             return output, tweet, json.dumps(matches)
+# #         else:
+# #             return output, tweet, json.dumps(matches)
+
+# # # output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
+# # # print(output, '\n', tweet, '\n', matches)
+
+# # hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
 
 demo = gr.Interface(
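The new predict distinguishes responses by shape: on success the Inference API returns a nested list of label/score objects, while errors (for example a model that is still cold-starting) come back as a JSON object, hence the type(prediction) is dict check. A sketch of the parsing, with a made-up payload:

# Made-up success payload; the shape mirrors the Inference API, the scores are invented.
response = [[{"label": "Abusive", "score": 0.97},
             {"label": "Non-Abusive", "score": 0.03}]]

prediction = dict(tuple(d.values()) for d in response[0])
print(prediction)  # {'Abusive': 0.97, 'Non-Abusive': 0.03}

# A still-loading model instead returns an object such as:
# {"error": "Model ... is currently loading", "estimated_time": 20.0}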
@@ -169,19 +194,13 @@ demo = gr.Interface(
 
     inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
 
-    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
-             gr.components.Text(label='OUTPUT'),
-             gr.components.JSON(label='DETECTED PROFANITIES')],
+    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
 
     examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
               'Napakainit ngayong araw pakshet namaaan!!',
               'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
               'Bobo ka ba? napakadali lang nyan eh... 🤡',
               'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
-
-    allow_flagging="manual",
-    flagging_callback=hf_writer,
-    flagging_options=['Good bot', 'Bad bot']
 )
 
-demo.launch()
+demo.launch()
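The interface now exposes only the label output; the Text and JSON components and the flagging hooks are gone. For reference, a minimal self-contained version of the simplified interface; the fn= line sits outside the hunk, so wiring predict in as the handler is an assumption:

import gradio as gr

def predict(text):
    # Stand-in handler; the real app returns the parsed model scores.
    return {"Abusive": 0.9, "Non-Abusive": 0.1}

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],
)
demo.launch()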
 