# Hugging Face Space: mginoben/tagalog-profanity-classification

import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np
import re
from string import punctuation
import nltk
nltk.download('words')
from nltk.corpus import words


import os

# Hosted Inference API endpoint of the profanity-detection model.
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"

# SECURITY NOTE: an access token is committed in this file. Prefer supplying
# it through the HF_API_TOKEN environment variable (e.g. a Space secret); the
# literal below is kept only as a fallback so existing deployments keep
# working, and should be revoked and removed.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN") or "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"

# HTTP headers sent with every request to API_URL.
headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"}

def query(text, timeout=30):
    """Send ``text`` to the hosted model and return the decoded JSON reply.

    Args:
        text: The (preprocessed) tweet to classify.
        timeout: Seconds to wait for the Inference API before giving up, so
            a stalled connection cannot block the app indefinitely.

    Returns:
        The decoded JSON body of the response. When the request itself fails
        (connection error, timeout) or the body is not valid JSON, a dict of
        the form ``{"error": "<message>"}`` is returned instead of raising,
        so the failure reaches the caller as an ordinary error payload.
    """
    payload = {"inputs": text}
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older requests versions.
        return {"error": f"Inference request failed: {exc}"}

def read_text(filename, filetype='txt'):
    """Load a word resource from disk.

    Args:
        filename: Path of the resource *without* its extension; the
            extension is appended according to ``filetype``.
        filetype: ``'txt'`` for a file with one entry per line, or
            ``'json'`` for a JSON document.

    Returns:
        For ``'txt'``: a list of the distinct lines with trailing whitespace
        removed, in first-seen order (a stable order keeps downstream
        tie-breaking reproducible between runs).
        For ``'json'``: whatever ``json.load`` produces for the document.
        For any other ``filetype``: an empty list.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a JSON file cannot be decoded.
    """
    # Files are read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    if filetype == 'txt':
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys de-duplicates while preserving file order.
            return list(dict.fromkeys(line.rstrip() for line in file))

    if filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            return json.load(json_file)

    return []


# Word resources, loaded once at import time from files next to this script.
contractions = read_text('contractions', 'json')  # mapping: contraction -> expansion
lookup_words = read_text('lookup_words')           # candidate words for fuzzy matching
obj_pronouns = read_text('obj_pronouns')           # pronouns that may be glued onto a profanity
profanities = read_text('profanities', 'json')     # mapping: profanity -> list of its variants

# Every known profanity spelling: all variants followed by all dictionary keys.
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])

# English vocabulary minus anything that is also a known profanity. Stored as
# a frozenset because it is only used for membership tests, which would be a
# linear scan over the whole corpus for every token if this were a list.
eng_words = frozenset(words.words()) - set(lookup_profanity)

# TODO: check for English words that are also Tagalog profanities

def fuzzy_lookup(tweet):
    """Rewrite misspelled or obfuscated profanities in ``tweet`` to a known form.

    Each whitespace-separated token is stripped of surrounding punctuation,
    skipped if it is an English word, and otherwise compared against every
    entry of ``lookup_words`` with ``fuzz.ratio``. If the best-scoring entry
    (score >= 70) is a known profanity, the token is recorded as a match.

    Args:
        tweet: The text to scan.

    Returns:
        A ``(tweet, matched_profanity)`` tuple: the text with matched tokens
        and known profanity variants replaced, and a dict mapping each
        matched token (punctuation-stripped) to its replacement.
    """
    matched_profanity = {}

    for token in tweet.split():
        # Strip punctuation *before* the English-word test; otherwise a token
        # such as "shirt!" is not recognised as English and may be
        # fuzzy-matched to a profanity.
        word = token.strip(punctuation)
        if word in eng_words:
            continue

        # Compare only letters, digits and '@' (used as a letter substitute).
        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)
        if len(processed_word) < 4:
            # Very short tokens are not fuzzy-matched.
            continue

        # Keep the first lookup word with the highest qualifying ratio.
        best_score = 0
        best_match = None
        for lookup_word in lookup_words:
            score = fuzz.ratio(processed_word, lookup_word)
            if score >= 70 and score > best_score:
                best_score = score
                best_match = lookup_word

        # Only record the token when its closest lookup word is a profanity.
        if best_match is not None and best_match in lookup_profanity:
            matched_profanity[word] = best_match

    # Expand Pronouns in Profanities: if what follows the profanity's last
    # two letters inside the token is an object pronoun, split it off as a
    # separate word. Only existing keys are reassigned, so updating the dict
    # while iterating it is safe.
    for word, profanity in matched_profanity.items():
        word_split = word.split(profanity[-2:])
        if len(word_split) > 1 and word_split[-1] in obj_pronouns:
            matched_profanity[word] = profanity + ' ' + word_split[-1]

    # Replace each profanity by its fuzzy lookup result.
    # NOTE(review): str.replace works on substrings, so a matched token that
    # also occurs inside a longer word is replaced there too.
    for word, profanity in matched_profanity.items():
        tweet = tweet.replace(word, profanity)

    # Normalise every listed variant to its dictionary key.
    for profanity, prof_variations in profanities.items():
        for prof_variant in prof_variations:
            tweet = tweet.replace(prof_variant, profanity)

    return tweet, matched_profanity


def preprocess(tweet):
    """Clean a raw tweet and normalise the profanities it contains.

    Steps, in order: lowercase; remove emoji; collapse any character
    repeated three or more times to a single occurrence; drop tokens that
    contain 'http'; unify laugh tokens to 'haha'; run ``fuzzy_lookup``; and,
    unless the result is a single word, expand contractions.

    Args:
        tweet: The raw input text.

    Returns:
        A ``(preprocessed_tweet, matches)`` tuple, where ``matches`` is the
        dict of matched profanities returned by ``fuzzy_lookup``.
    """
    # Substrings that mark a token as laughter.
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    kept_words = []
    for word in tweet.split():
        # Remove links. Skipping right away ensures a link that happens to
        # contain a laugh pattern is not turned back into 'haha'.
        if 'http' in word:
            continue

        # Unify laugh texts format to 'haha'
        if any(x in word for x in laugh_texts):
            kept_words.append('haha')
        else:
            kept_words.append(word)

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(kept_words)

    # Fuzzy Lookup
    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)

    # A single remaining word is returned without contraction expansion.
    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet, matches

    # Expand Contractions
    # NOTE(review): keys are interpolated into the pattern unescaped, so a
    # key containing regex metacharacters is treated as a pattern - confirm
    # the contents of the contractions file.
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet, matches



def predict(tweet):
    """Classify ``tweet`` and return the result shown in the UI.

    Args:
        tweet: Raw text entered by the user.

    Returns:
        ``"No Profanity"`` when preprocessing matched no profanity; an error
        message string when the model reply is a dict (treated as an error
        payload); otherwise a dict mapping each label to its score.
    """
    preprocessed_tweet, matched_profanity = preprocess(tweet)

    # Without a matched profanity the answer does not depend on the model, so
    # return before making a network request whose result would be discarded
    # (and whose failure would otherwise be reported for clean input).
    if not matched_profanity:
        return "No Profanity"

    prediction = query(preprocessed_tweet)

    # A dict reply (rather than a list of results) signals an error.
    if isinstance(prediction, dict):
        print(prediction)
        return prediction.get('error', str(prediction))

    # The first element holds one {'label': ..., 'score': ...} entry per
    # class; pair them by key rather than relying on the order of values.
    prediction = {item['label']: item['score'] for item in prediction[0]}

    print("\nTWEET:", tweet)
    print("DETECTED PROFANITY:", matched_profanity)
    print("LABELS:", prediction, "\n")

    return prediction


# Gradio UI: a multi-line text box feeding `predict`, whose result is shown
# in a label component limited to the two top classes.
input_box = gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')
prediction_label = gr.components.Label(num_top_classes=2, label="PREDICTION")

# Sample inputs offered below the form.
example_tweets = [
    'Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
    'Napakainit ngayong araw pakshet namaaan!!',
    'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
    'Bobo ka ba? napakadali lang nyan eh... 🤡',
    'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂',
]

demo = gr.Interface(
    fn=predict,
    inputs=[input_box],
    outputs=[prediction_label],
    examples=example_tweets,
)

# Start the web server; debug=True is passed through to Gradio unchanged.
demo.launch(debug=True)