File size: 4,714 Bytes
34fbcfb
 
 
 
3172d47
 
 
bce56c0
114694a
40a4fcd
 
 
3172d47
34fbcfb
7a70c71
 
34fbcfb
91caef4
f108b87
 
91caef4
 
3172d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f108b87
 
3172d47
bdff148
f108b87
bdff148
3172d47
33125f0
3172d47
6912dca
7a70c71
f108b87
3172d47
 
f108b87
 
 
bdff148
d9ea7b2
f108b87
 
 
 
 
 
 
7a70c71
 
114694a
7a70c71
93004e9
370f6d7
3172d47
 
370f6d7
eb2943c
3172d47
 
 
 
 
f108b87
3172d47
f108b87
3172d47
 
bce56c0
7a70c71
bce56c0
 
34fbcfb
201dfa5
bce56c0
34fbcfb
bce56c0
34fbcfb
 
 
 
 
 
 
 
7a70c71
34fbcfb
 
 
 
7a70c71
34fbcfb
7a70c71
f108b87
34fbcfb
 
 
7a70c71
34fbcfb
f108b87
34fbcfb
 
93004e9
bce56c0
91caef4
f108b87
 
7a70c71
f108b87
7a70c71
f108b87
201dfa5
f108b87
 
 
 
201dfa5
f108b87
 
 
 
 
16316d5
f108b87
 
 
fe9ff70
201dfa5
6c938dd
 
48392ea
6c938dd
34fbcfb
f108b87
34fbcfb
6c938dd
 
 
 
 
f108b87
 
 
 
6c938dd
34fbcfb
f108b87
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np
import re
from string import punctuation
import nltk
nltk.download('words')
from nltk.corpus import words


# Hugging Face Inference API endpoint of the hosted classification model that
# `query` posts to.
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# HTTP headers sent with every request to API_URL.
# NOTE(security): the bearer token is hard-coded in source, so anyone who can
# read this file can use it. Consider rotating it and loading it from an
# environment variable or a secret store instead.
headers = {"Authorization": "Bearer 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"}

def query(text, timeout=30):
    """Send ``text`` to the hosted model at ``API_URL`` and return the decoded reply.

    Args:
        text: String to classify; sent as the ``inputs`` field of the JSON payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response. When the request fails or times
        out, or the body is not valid JSON, a dict ``{"error": <message>}`` is
        returned instead of raising, so callers that treat a dict reply as an
        error payload can report it.
    """
    payload = {"inputs": text}

    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        return {"error": f"Request to the inference API failed: {exc}"}

    try:
        return response.json()
    except ValueError as exc:
        # Raised when the body is not JSON (for example an HTML error page).
        return {"error": f"Inference API returned a non-JSON response: {exc}"}

def read_text(filename, filetype='txt'):
    """Load a word list or a JSON document from disk.

    Args:
        filename: Path of the file *without* its extension; the extension is
            appended according to ``filetype``.
        filetype: ``'txt'`` reads ``<filename>.txt`` as one entry per line;
            ``'json'`` reads ``<filename>.json`` and decodes it.

    Returns:
        For ``'txt'``: a list of the distinct lines (trailing whitespace
        removed), in the order they first appear in the file.
        For ``'json'``: whatever value the JSON document decodes to.
        For any other ``filetype``: an empty list.
    """
    if filetype == 'txt':
        # Explicit UTF-8 keeps decoding independent of the platform default.
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            return list(dict.fromkeys(line.rstrip() for line in file))

    if filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            return json.load(json_file)

    return []


# Lookup tables, loaded once at import time from data files addressed
# relative to the current working directory.
contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities = read_text('profanities', 'json')
# Every profanity spelling in one array: the flattened values of `profanities`
# followed by its keys (presumably each value is a list of variants of its
# key; verify against profanities.json).
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
# Candidate pool for fuzzy matching: the similar words plus all profanities.
lookup_words = list(set(similar_words).union(set(lookup_profanity.tolist())))
# English vocabulary minus anything that is also a listed profanity. Kept as a
# set because it is only used for per-token membership tests, which would be a
# linear scan over the whole NLTK word list if this were a list.
eng_words = set(words.words()) - set(lookup_profanity)

# TODO: check for English words that are also Tagalog profanities

def fuzzy_lookup(tweet):
    """Return the words of ``tweet`` whose best fuzzy match is a known profanity.

    Each whitespace-separated word is handled as follows:

    1. Words found in ``eng_words`` are skipped.
    2. Any entry of ``addon_words`` found at the start or end of the word is
       stripped off.
    3. Surrounding punctuation is stripped and every character other than
       ASCII letters, digits and ``@`` is removed.
    4. If at least 4 characters remain, the result is scored against every
       entry of ``lookup_words`` with ``fuzz.ratio``; entries scoring 70 or
       more are candidates.
    5. The word is reported when the highest-scoring candidate (the first one
       on ties) is contained in ``lookup_profanity``.

    Args:
        tweet: Text to scan.

    Returns:
        A list with the original (unstripped) form of every flagged word, in
        order of appearance.
    """
    matched_profanity = []

    for base_word in tweet.split():

        if base_word in eng_words:
            continue

        word = base_word
        for addon in addon_words:
            # An empty addon would match everything, and word[:-0] would
            # erase the whole word.
            if not addon:
                continue
            # The slices must be assigned back: a bare slice expression
            # leaves `word` unchanged.
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        word = word.strip(punctuation)
        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)

        # Very short tokens are not fuzzy-matched at all.
        if len(processed_word) < 4:
            continue

        # Track the best candidate; strict '>' keeps the first one on ties.
        best_score = -1
        best_match = None
        for lookup_word in lookup_words:
            score = fuzz.ratio(processed_word, lookup_word)
            if score >= 70 and score > best_score:
                best_score = score
                best_match = lookup_word

        if best_match is not None and best_match in lookup_profanity:
            matched_profanity.append(base_word)

    return matched_profanity


def preprocess(tweet):
    """Normalise ``tweet`` before profanity lookup and classification.

    Steps, in order: lower-case the text, remove emoji, collapse any character
    repeated three or more times in a row to a single occurrence, drop every
    token containing ``http``, replace every token containing a laugh pattern
    with ``haha``, and finally expand contractions using the ``contractions``
    mapping.

    Args:
        tweet: Raw input text.

    Returns:
        The cleaned text with tokens separated by single spaces. When exactly
        one token remains it is returned as-is, without contraction expansion.
    """
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Elongated words conversion: 3+ repeats of a character become one.
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Substrings that mark a token as laughter; built once, not per token.
    laugh_texts = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')

    kept_words = []
    for word in tweet.split():

        # Remove links. Skipping here also stops a URL that happens to contain
        # a laugh pattern from being turned into 'haha' below.
        if 'http' in word:
            continue

        # Unify laugh texts format to 'haha'
        if any(x in word for x in laugh_texts):
            word = 'haha'

        kept_words.append(word)

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(kept_words)

    if len(kept_words) == 1:
        return preprocessed_tweet

    # Expand contractions. Each key is interpolated into the pattern as-is
    # (not escaped) and matched between word boundaries.
    for short_form, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{short_form}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet



def predict(tweet):
    """Classify ``tweet`` and list the profane words found in it.

    The tweet is preprocessed and scanned with ``fuzzy_lookup``. The remote
    model is queried only when at least one profanity is matched.

    Args:
        tweet: Raw input text.

    Returns:
        A ``(label, rows)`` pair:

        * no profanity matched: ``("No Profanity", [[]])``;
        * the model reply is a dict (treated as an error payload): its
          ``'error'`` entry, or the whole payload as a string when that key
          is missing, together with ``[[]]``;
        * otherwise: the ``"label"`` of the first entry of the model reply
          and ``[matched_profanity]``.
    """
    preprocessed_tweet = preprocess(tweet)
    matched_profanity = fuzzy_lookup(preprocessed_tweet)

    if not matched_profanity:
        return "No Profanity", [[]]

    prediction = query(preprocessed_tweet)

    # A dict reply carries an error instead of predictions.
    if isinstance(prediction, dict):
        print(prediction)
        # .get avoids a KeyError when the payload has no 'error' entry.
        error_message = prediction.get('error', str(prediction))
        return error_message, [[]]

    prediction = prediction[0][0]["label"]

    print("\nTWEET:", tweet)
    print("DETECTED PROFANITY:", matched_profanity)
    print("LABEL:", prediction, "\n")

    return prediction, [matched_profanity]


# Sample inputs offered in the web UI.
example_tweets = [
    'Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
    'Napakainit ngayong araw pakshet namaaan!!',
    'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
    'Bobo ka ba? napakadali lang nyan eh... 🤡',
    'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂',
]

# Input widget: a five-line free-text box.
input_box = gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')

# Output widgets: the predicted label and the list of matched words.
prediction_box = gr.components.Text(label="PREDICTION")
profanity_list = gr.List(label="PROFANITIES")

# Web interface that wires `predict` to the widgets above.
demo = gr.Interface(
    title="Tagalog Profanity Classifier",
    fn=predict,
    inputs=[input_box],
    outputs=[prediction_box, profanity_list],
    examples=example_tweets,
    allow_flagging="never",
)

# Start the app; the call below runs once launch() returns.
demo.launch(debug=True)
predict("Tangina mo naman gag0 ka ba")