mginoben committed on
Commit
3172d47
β€’
1 Parent(s): 48392ea

Added match words list on output

Browse files
Files changed (1) hide show
  1. app.py +88 -66
app.py CHANGED
@@ -2,79 +2,99 @@ import gradio as gr
2
  import requests
3
  import emoji
4
  import re
 
 
 
 
5
 
6
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
7
  headers = {"Authorization": "Bearer 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"}
8
 
9
- profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset', 'bwesit', 'gago', 'gagong', 'kupal',
10
- 'pakshet', 'pakyu', 'pucha', 'puchang',
11
- 'punyeta', 'punyetang', 'puta', 'putang', 'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
12
- 'tanginang', 'tarantado', 'tarantadong', 'ulol']
13
-
14
- contractions = {
15
- 'di': 'hindi',
16
- 'to': 'ito',
17
- 'no': 'ano',
18
- 'kundi': 'kung hindi',
19
- 'nya': 'niya',
20
- 'nyo': 'ninyo',
21
- 'niyo': 'ninyo',
22
- 'pano': 'paano',
23
- 'sainyo': 'sa inyo',
24
- 'sayo': 'sa iyo',
25
- 'pag': 'kapag',
26
- 'kesa': 'kaysa',
27
- 'dun': 'doon',
28
- 'ganto': 'ganito',
29
- 'nandun': 'nandoon',
30
- 'saka': 'tsaka',
31
- 'ung': 'yung',
32
- 'wag': 'huwag',
33
- 'sya': 'siya',
34
- 'bat': 'bakit',
35
- 'yon': 'iyon',
36
- 'yun': 'iyon',
37
- 'dyan': 'diyan',
38
- 'jan': 'diyan',
39
- 'andito': 'nandito',
40
- 'tanginamo': 'tangina mo',
41
- 'putanginamo': 'putangina mo',
42
- 'san': 'saan',
43
- 'ganun': 'ganoon',
44
- 'gagong': 'gago na',
45
- 'bobong': 'bobo na',
46
- 'tangang': 'tanga na',
47
- 'kelan': 'kailan',
48
- 'raw': 'daw',
49
- 'tanginang': 'tangina na',
50
- 'tarantadong': 'tarantado na',
51
- 'putang ina': 'putangina',
52
- 'putang inang': 'putangina',
53
- 'putanginang': 'putangina',
54
- 'itong': 'ito ang',
55
- 'lng': 'lang',
56
- 'bwisit': 'bwiset',
57
- 'bwesit': 'bwiset',
58
- 'buwisit': 'bwiset',
59
- 'buwesit': 'bwiset'
60
- }
61
-
62
-
63
- def preprocess(row):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
65
  symbols = ['@', '#']
66
 
67
  # Lowercase
68
- row = row.lower()
69
 
70
  # Remove emojis
71
- row = emoji.replace_emoji(row, replace='')
72
 
73
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
74
- row = re.sub(r'(.)\1{2,}', r'\1', row)
75
 
76
  # Split sentence into list of words
77
- row_split = row.split()
78
 
79
  for index, word in enumerate(row_split):
80
 
@@ -117,11 +137,12 @@ def query(payload):
117
 
118
 
119
  def predict(text):
120
- output = query(preprocess(text))
121
- print(preprocess(text))
 
122
 
123
  if 'error' in output:
124
- return output['error'], 'Error occured. Try again later.'
125
  else:
126
  output = [tuple(i.values()) for i in output[0]]
127
  output = dict((x, y) for x, y in output)
@@ -136,11 +157,11 @@ def predict(text):
136
  for i in profanity:
137
  mask += "*" if i != " " else " "
138
  output_text = compiled.sub(mask, output_text)
139
- return output, output_text
140
  else:
141
- return output, text
142
-
143
 
 
144
 
145
 
146
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
@@ -152,7 +173,8 @@ demo = gr.Interface(
152
  inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
153
 
154
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
155
- gr.components.Text(label='OUTPUT')],
 
156
 
157
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
158
  'Napakainit ngayong araw pakshet namaaan!!',
 
2
  import requests
3
  import emoji
4
  import re
5
+ import json
6
+ from thefuzz import process, fuzz
7
+ import numpy as np
8
+
9
 
10
  API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
11
  headers = {"Authorization": "Bearer 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"}
12
 
13
+ profanities = ['bobo', 'bwiset','gago', 'kupal',
14
+ 'pakshet', 'pakyu', 'pucha',
15
+ 'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
16
+ 'tarantado', 'ulol']
17
+
18
def read_text(filename, filetype='txt'):
    """Load a word list or mapping from disk.

    Args:
        filename: Path without extension; ``.txt`` or ``.json`` is
            appended depending on *filetype*.
        filetype: ``'txt'`` reads one word per line and deduplicates;
            ``'json'`` parses the file with :mod:`json`. Any other value
            falls through and an empty list is returned.

    Returns:
        For ``'txt'``, a list of unique, right-stripped lines (order not
        guaranteed); for ``'json'``, whatever the JSON file contains;
        otherwise an empty list.
    """
    words = []

    if filetype == 'txt':
        # Explicit UTF-8: the word lists contain Tagalog text, so don't
        # depend on the platform's default encoding.
        with open(filename + '.txt', encoding='utf-8') as file:
            # Strip trailing whitespace/newlines and deduplicate in one pass.
            words = list({line.rstrip() for line in file})
    elif filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            words = json.load(json_file)

    return words
30
+
31
+
32
+ contractions = read_text('contractions', 'json')
33
+ lookup_words = read_text('lookup_words')
34
+ obj_pronouns = read_text('obj_pronouns')
35
+ profanities = read_text('profanities', 'json')
36
+
37
+
38
def fuzzyLookup(tweet):
    """Fuzzy-match each word of *tweet* against the lookup vocabulary and
    rewrite detected profanity variants to their canonical form.

    Relies on module-level globals loaded at import time:
    ``profanities`` (dict: canonical profanity -> list of variations),
    ``lookup_words`` (flat list of known words), ``fuzz`` (thefuzz) and
    ``np`` (numpy).

    Returns:
        tuple: ``(rewritten_tweet, matches_json)`` where ``matches_json``
        is a JSON string mapping each original word to its replacement.
    """
    # Flatten every canonical profanity plus all of its variations into a
    # single lookup array used for membership tests below.
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    # NOTE(review): this local list duplicates the module-level
    # ``obj_pronouns`` loaded from file — confirm which is authoritative.
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
    # Maps original tweet word -> best-matching profanity (possibly with
    # a detached object pronoun appended later).
    matches = dict()

    # Score every tweet word against the full lookup vocabulary.
    for word in tweet.split():
        scores = []
        matched_words = []
        # Skip words shorter than 4 chars — too many false fuzzy matches.
        if len(word) >= 4:
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:  # similarity threshold on thefuzz's 0-100 scale
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                # Keep only the single best-scoring candidate, and record
                # it only when that candidate is a known profanity.
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Detect a fused object pronoun (e.g. 'tanginamo' -> 'tangina mo'):
    # split the original word on the profanity's last two characters and
    # compare the trailing fragment against the pronoun list.
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronoun:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Rewrite each matched word to its normalized form.
    # NOTE(review): str.replace is substring-based, so a matched short
    # word may also rewrite longer words that contain it — confirm this
    # is the intended behavior.
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Finally, collapse any remaining variation tokens to their canonical
    # profanity key via exact whole-word lookup.
    tweet_split = tweet.split()
    for profanity, prof_varations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_varations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)
81
+
82
+
83
+ def preprocess(text):
84
  laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
85
  symbols = ['@', '#']
86
 
87
  # Lowercase
88
+ text = text.lower()
89
 
90
  # Remove emojis
91
+ text = emoji.replace_emoji(text, replace='')
92
 
93
  # Replace elongated words 'grabeee' -> 'grabe' (not applicable on 2 corresponding letter)
94
+ text = re.sub(r'(.)\1{2,}', r'\1', text)
95
 
96
  # Split sentence into list of words
97
+ row_split = text.split()
98
 
99
  for index, word in enumerate(row_split):
100
 
 
137
 
138
 
139
  def predict(text):
140
+ text= preprocess(text)
141
+ text, matches = fuzzyLookup(text)
142
+ output = query(text)
143
 
144
  if 'error' in output:
145
+ return output['error'], 'Error occured. Try again later.', {"error": "error"}
146
  else:
147
  output = [tuple(i.values()) for i in output[0]]
148
  output = dict((x, y) for x, y in output)
 
157
  for i in profanity:
158
  mask += "*" if i != " " else " "
159
  output_text = compiled.sub(mask, output_text)
160
+ return output, output_text, matches
161
  else:
162
+ return output, text, matches
 
163
 
164
+ # TODO gag0 not appearing
165
 
166
 
167
  hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
 
173
  inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
174
 
175
  outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
176
+ gr.components.Text(label='OUTPUT'),
177
+ gr.components.JSON()],
178
 
179
  examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
180
  'Napakainit ngayong araw pakshet namaaan!!',