import gradio as gr
import requests
import emoji
import re
import json
import os
from thefuzz import fuzz
import numpy as np
from english_words import get_english_words_set


API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
headers = {"Authorization": "Bearer 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"}

def query(text):
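    """Send text to the hosted inference API and return the parsed JSON response."""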
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

def read_text(filename, filetype='txt'):
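    """Load a lookup resource: one entry per line for .txt, a parsed object for .json."""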
    words = []

    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    
    return words


# Lookup resources (roles inferred from how they are used below):
# contractions      dict: contraction -> expanded form
# similar_words     list: words that resemble profanities (fuzzy-match candidates)
# addon_words       list: Tagalog add-on particles/affixes (e.g. mo, ka, pinaka)
# profanities_dict  dict: base profanity -> list of spelling variations
contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')

# Flatten all profanity variations and base forms into a single lookup list
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))

# English vocabulary used to skip ordinary words; kept as a set for fast membership tests
eng_words = get_english_words_set(['web2'], lower=True) - set(lookup_profanity)

# Strips leading/trailing non-word characters, but keeps # and @
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')

def fuzzy_lookup(tweet):
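    """Fuzzy-match each tweet token against known profanity spellings.

    Returns a dict mapping the original token to the base profanity it matched.
    """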

    matched_profanity = dict()

    for word in tweet.split():

        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Strip surrounding punctuation (except # and @) and lowercase
        word = punctuations.sub('', word).lower()

        # Keep the cleaned token for reporting the match later
        base_word = word

        # Shorten elongated words (e.g. "bobooo" -> "bobo")
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on particles/affixes from either end (e.g. mo, ka, pinaka)
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        if len(word) < 4:
            continue

        # Get fuzzy ratio
        for lookup_word in lookup_words:

            score = fuzz.ratio(word, lookup_word)

            # Threshold
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates tied: prefer one that is itself a known profanity
            for candidate in matched_words:
                if candidate in lookup_profanity:
                    matched_word = candidate
                    break
        else:
            # Otherwise take the candidate with the highest score
            max_score_index = np.argmax(scores)
            matched_word = matched_words[max_score_index]

        if matched_word not in lookup_profanity:
            continue

        for base_profanity, profanity_variations in profanities_dict.items():

            if matched_word in profanity_variations or matched_word == base_profanity:

                # Separate trailing pronouns/particles back out (e.g. mo, ka)
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break

                matched_profanity[base_word] = base_profanity
                break
                        
    return matched_profanity


def preprocess(tweet, profanities):
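    """Normalize a tweet: lowercase, strip emojis and links, canonicalize detected
    profanities, collapse elongated words and laugh variants, expand contractions."""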
    
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace profanities
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()

    # Laugh variants to unify; hoisted out of the loop so the list is built once
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    for index, word in enumerate(row_split):

        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh text formats to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand Contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{re.escape(contraction)}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet



def predict(tweet):
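    """Run the full pipeline: fuzzy profanity lookup, preprocessing, then model inference."""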
    
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:

        preprocessed_tweet = preprocess(tweet, profanities)

        prediction = query(preprocessed_tweet)

        # The Inference API returns a dict (rather than a list of predictions)
        # on errors, e.g. while the model is still loading
        if isinstance(prediction, dict):
            print(prediction)
            error_message = prediction['error']
            return error_message, {}

        # Keep only the top-scoring label
        prediction = prediction[0][0]["label"]
        
        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())
    
    return "No Profanity", {}


demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    allow_flagging="never",

    title="Tagalog Profanity Classifier"
)

demo.launch(debug=True)