mginoben committed on
Commit
34fbcfb
1 Parent(s): 595931f

Add application file

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re

import emoji
import gradio as gr
import pandas as pd
import requests
6
+
7
# Hosted inference endpoint for the fine-tuned Tagalog profanity classifier.
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"

# NOTE(review): the API token was hard-coded in a public file — it should be
# rotated. Prefer the HF_API_TOKEN environment variable; the literal is kept
# only as a backward-compatible fallback.
headers = {
    "Authorization": "Bearer "
    + os.environ.get("HF_API_TOKEN", "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")
}

# Profanity lexicon used to censor words in text the model flags as 'Abusive'.
profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset', 'bwesit', 'gago', 'gagong', 'kupal',
               'pakshet', 'pakyu', 'pucha', 'puchang',
               'punyeta', 'punyetang', 'puta', 'putang', 'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
               'tanginang', 'tarantado', 'tarantadong', 'ulol']

# Tagalog shortcut/contraction -> canonical expanded form, applied whole-word
# during preprocessing so the model sees a normalized vocabulary.
contractions = {
    'di': 'hindi',
    'to': 'ito',
    'no': 'ano',
    'kundi': 'kung hindi',
    'nya': 'niya',
    'nyo': 'ninyo',
    'niyo': 'ninyo',
    'pano': 'paano',
    'sainyo': 'sa inyo',
    'sayo': 'sa iyo',
    'pag': 'kapag',
    'kesa': 'kaysa',
    'dun': 'doon',
    'ganto': 'ganito',
    'nandun': 'nandoon',
    'saka': 'tsaka',
    'ung': 'yung',
    'wag': 'huwag',
    'sya': 'siya',
    'bat': 'bakit',
    'yon': 'iyon',
    'yun': 'iyon',
    'dyan': 'diyan',
    'jan': 'diyan',
    'andito': 'nandito',
    'tanginamo': 'tangina mo',
    'putanginamo': 'putangina mo',
    'san': 'saan',
    'ganun': 'ganoon',
    'gagong': 'gago na',
    'bobong': 'bobo na',
    'tangang': 'tanga na',
    'kelan': 'kailan',
    'raw': 'daw',
    'tanginang': 'tangina na',
    'tarantadong': 'tarantado na',
    'putang ina': 'putangina',
    'putang inang': 'putangina',
    'putanginang': 'putangina',
    'itong': 'ito ang',
    'lng': 'lang',
    'bwisit': 'bwiset',
    'bwesit': 'bwiset',
    'buwisit': 'bwiset',
    'buwesit': 'bwiset'
}
62
+
63
+
64
def preprocess(row):
    """Normalize one raw input string before it is sent to the classifier.

    Lowercases, strips emojis, collapses elongated spellings, drops
    mentions/hashtags/links/digit-bearing words, unifies laugh spellings to
    'haha', removes non-letter characters, and expands Tagalog contractions.
    """
    laugh_variants = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    def _clean_word(word):
        # Every rule inspects the ORIGINAL word; later rules override earlier
        # replacements, mirroring the original pass order.
        out = word
        # Drop mentions / hashtags (e.g. @username, #hashtag).
        if '@' in word or '#' in word:
            out = ''
        # Drop links.
        if 'http' in word:
            out = ''
        # Unify laugh spellings to a single canonical token.
        if any(tag in word for tag in laugh_variants):
            out = 'haha'
        # Drop words containing digits (e.g. '4ever').
        if any(ch.isdigit() for ch in word):
            out = ''
        return out

    # Lowercase and strip emojis.
    text = emoji.replace_emoji(row.lower(), replace='')

    # Collapse runs of 3+ identical characters: 'grabeee' -> 'grabe'
    # (doubled letters are left alone).
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Per-word cleanup, then stitch the survivors back together.
    cleaned = ' '.join(filter(None, (_clean_word(w) for w in text.split())))

    # NOTE(review): the original comment said a single-word result should
    # "return null", but the code returns it as-is, skipping the character
    # filter and contraction expansion below — behavior preserved verbatim.
    if len(cleaned.split()) == 1:
        return cleaned

    # Keep only letters and spaces.
    cleaned = re.sub(r"[^A-Za-z ]+", '', cleaned)

    # Expand contractions on whole-word boundaries.
    for short_form, expansion in contractions.items():
        cleaned = re.sub(rf"\b{short_form}\b", expansion, cleaned)

    return cleaned
113
+
114
+
115
def query(payload):
    """POST *payload* to the hosted inference endpoint and return the decoded JSON body."""
    resp = requests.post(API_URL, headers=headers, json=payload)
    return resp.json()
118
+
119
+
120
def predict(text):
    """Classify *text* and censor profanities when it is predicted 'Abusive'.

    Returns a 2-tuple of (label -> score dict for the gr.Label component,
    output text with profanities masked as '****' if abusive, else the
    original text).
    """
    # Preprocess once and reuse (the original called preprocess() twice).
    cleaned = preprocess(text)
    print(cleaned)  # debug trace, kept from original

    # The inference API returns a nested list: [[{'label': ..., 'score': ...}, ...]]
    results = query(cleaned)[0]
    print(results)  # debug trace, kept from original

    # Map label -> score directly instead of round-tripping through tuples.
    # NOTE(review): assumes each item has 'label'/'score' keys and that the
    # API returns items sorted by score descending — confirm against the API.
    scores = {item['label']: item['score'] for item in results}

    # Top-scoring label is the first key (insertion order).
    predicted_label = next(iter(scores))

    if predicted_label == 'Abusive':
        # Mask every lexicon profanity, case-insensitively, in the raw input.
        censored = text
        for word in profanities:
            pattern = re.compile(re.escape(word), re.IGNORECASE)
            censored = pattern.sub('****', censored)
        return scores, censored
    else:
        return scores, text
137
+
138
+
139
# Flag callback that appends flagged samples to a Hugging Face dataset.
# NOTE(review): this write token was hard-coded in a public file — it should be
# rotated. Prefer the HF_WRITER_TOKEN environment variable; the literal is
# kept only as a backward-compatible fallback.
hf_writer = gr.HuggingFaceDatasetSaver(
    os.environ.get('HF_WRITER_TOKEN', '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'),
    'tagalog-profanity-feedbacks')


# Web UI: one textbox in, (label scores + censored text) out, with manual
# flagging wired to the dataset saver above.
demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT')],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

# debug=True surfaces tracebacks in the UI/console while developing.
demo.launch(debug=True)