mginoben committed on
Commit
bce56c0
•
1 Parent(s): 72286e6

Fixed word lookup including emojis

Browse files
Files changed (1) hide show
  1. app.py +21 -28
app.py CHANGED
@@ -5,16 +5,12 @@ import re
5
  import json
6
  from thefuzz import process, fuzz
7
  import numpy as np
 
8
 
9
 
10
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
11
  headers = {"Authorization": "Bearer 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"}
12
 
13
- profanities = ['bobo', 'bwiset','gago', 'kupal',
14
- 'pakshet', 'pakyu', 'pucha',
15
- 'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
16
- 'tarantado', 'ulol']
17
-
18
  def read_text(filename, filetype='txt'):
19
  words = []
20
 
@@ -42,6 +38,8 @@ def fuzzyLookup(tweet):
42
 
43
  # Loop each word in tweet
44
  for word in tweet.split():
 
 
45
  scores = []
46
  matched_words = []
47
  # If word > 4 chars
@@ -77,24 +75,24 @@ def fuzzyLookup(tweet):
77
  tweet_split[i] = profanity
78
  tweet = ' '.join(tweet_split)
79
 
80
- return tweet, json.dumps(matches)
81
 
82
 
83
- def preprocess(text):
84
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
85
  symbols = ['@', '#']
86
 
87
  # Lowercase
88
- text = text.lower()
89
 
90
  # Remove emojis
91
- text = emoji.replace_emoji(text, replace='')
92
 
93
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
94
- text = re.sub(r'(.)\1{2,}', r'\1', text)
95
 
96
  # Split sentence into list of words
97
- row_split = text.split()
98
 
99
  for index, word in enumerate(row_split):
100
 
@@ -136,32 +134,27 @@ def query(payload):
136
  return response.json()
137
 
138
 
139
- def predict(text):
140
- text= preprocess(text)
141
- text, matches = fuzzyLookup(text)
142
- output = query(text)
 
143
 
144
  if 'error' in output:
145
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
146
  else:
147
  output = [tuple(i.values()) for i in output[0]]
148
  output = dict((x, y) for x, y in output)
149
-
150
  predicted_label = list(output.keys())[0]
151
 
152
  if predicted_label == 'Abusive':
153
- output_text = text
154
- for profanity in profanities:
155
- compiled = re.compile(re.escape(profanity), re.IGNORECASE)
156
- mask = ""
157
- for i in profanity:
158
- mask += "*" if i != " " else " "
159
- output_text = compiled.sub(mask, output_text)
160
- return output, output_text, matches
161
  else:
162
- return output, text, matches
163
-
164
- # TODO gag0 not appearing
165
 
166
 
167
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
@@ -174,7 +167,7 @@ demo = gr.Interface(
174
 
175
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
176
  gr.components.Text(label='OUTPUT'),
177
- gr.components.JSON()],
178
 
179
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
180
  'Napakainit ngayong araw pakshet namaaan!!',
 
5
  import json
6
  from thefuzz import process, fuzz
7
  import numpy as np
8
+ import re
9
 
10
 
11
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
12
  headers = {"Authorization": "Bearer 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"}
13
 
 
 
 
 
 
14
  def read_text(filename, filetype='txt'):
15
  words = []
16
 
 
38
 
39
  # Loop each word in tweet
40
  for word in tweet.split():
41
+ # Only get digits and letters
42
+ word = re.sub("[^a-zA-Z0-9@]", "", word)
43
  scores = []
44
  matched_words = []
45
  # If word > 4 chars
 
75
  tweet_split[i] = profanity
76
  tweet = ' '.join(tweet_split)
77
 
78
+ return tweet, matches
79
 
80
 
81
+ def preprocess(tweet):
82
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
83
  symbols = ['@', '#']
84
 
85
  # Lowercase
86
+ tweet = tweet.lower()
87
 
88
  # Remove emojis
89
+ tweet = emoji.replace_emoji(tweet, replace='')
90
 
91
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
92
+ tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)
93
 
94
  # Split sentence into list of words
95
+ row_split = tweet.split()
96
 
97
  for index, word in enumerate(row_split):
98
 
 
134
  return response.json()
135
 
136
 
137
+ def predict(tweet):
138
+
139
+ fuzz_text, matches = fuzzyLookup(tweet)
140
+ processed_text = preprocess(fuzz_text)
141
+ output = query(processed_text)
142
 
143
  if 'error' in output:
144
  return output['error'], 'Error occured. Try again later.', {"error": "error"}
145
  else:
146
  output = [tuple(i.values()) for i in output[0]]
147
  output = dict((x, y) for x, y in output)
 
148
  predicted_label = list(output.keys())[0]
149
 
150
  if predicted_label == 'Abusive':
151
+ for base_word, _ in matches.items():
152
+ tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
153
+
154
+ return output, tweet, json.dumps(matches)
 
 
 
 
155
  else:
156
+ return output, tweet, json.dumps(matches)
157
+
 
158
 
159
 
160
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
167
 
168
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
169
  gr.components.Text(label='OUTPUT'),
170
+ gr.components.JSON(label='DETECTED PROFANITIES')],
171
 
172
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
173
  'Napakainit ngayong araw pakshet namaaan!!',