Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

File size: 4,714 Bytes

34fbcfb
 
 
 
3172d47
 
 
bce56c0
114694a
40a4fcd
 
 
3172d47
34fbcfb
7a70c71
 
34fbcfb
91caef4
f108b87
 
91caef4
 
3172d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f108b87
 
3172d47
bdff148
f108b87
bdff148
3172d47
33125f0
3172d47
6912dca
7a70c71
f108b87
3172d47
 
f108b87
 
 
bdff148
d9ea7b2
f108b87
 
 
 
 
 
 
7a70c71
 
114694a
7a70c71
93004e9
370f6d7
3172d47
 
370f6d7
eb2943c
3172d47
 
 
 
 
f108b87
3172d47
f108b87
3172d47
 
bce56c0
7a70c71
bce56c0
 
34fbcfb
201dfa5
bce56c0
34fbcfb
bce56c0
34fbcfb
 
 
 
 
 
 
 
7a70c71
34fbcfb
 
 
 
7a70c71
34fbcfb
7a70c71
f108b87
34fbcfb
 
 
7a70c71
34fbcfb
f108b87
34fbcfb
 
93004e9
bce56c0
91caef4
f108b87
 
7a70c71
f108b87
7a70c71
f108b87
201dfa5
f108b87
 
 
 
201dfa5
f108b87
 
 
 
 
16316d5
f108b87
 
 
fe9ff70
201dfa5
6c938dd
 
48392ea
6c938dd
34fbcfb
f108b87
34fbcfb
6c938dd
 
 
 
 
f108b87
 
 
 
6c938dd
34fbcfb
f108b87

import json
import os
import re
import re
from string import punctuation

import emoji
import gradio as gr
import nltk
import numpy as np
import requests
from thefuzz import process, fuzz

nltk.download('words')
from nltk.corpus import words


# Hosted inference endpoint of the classification model queried by query().
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"

# SECURITY(review): this access token is committed in plain text. It should be
# revoked and supplied through the HF_TOKEN environment variable (e.g. a Space
# secret) instead. The literal is kept only as a fallback so that existing
# deployments keep working until the secret is configured.
_FALLBACK_HF_TOKEN = "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"

# An unset or empty HF_TOKEN falls back to the committed value.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN') or _FALLBACK_HF_TOKEN}"}

def query(text, timeout=30):
    """Send *text* to the hosted model at API_URL and return the decoded reply.

    Args:
        text: The string to classify; it is posted as ``{"inputs": text}``.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The decoded JSON body of the response. If the request cannot be
        completed or the body is not valid JSON, ``{"error": "<message>"}`` is
        returned instead, which is the dict shape ``predict`` reports as an
        error message.
    """
    payload = {"inputs": text}
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server, which would hang the UI callback.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers the JSON decoding failure raised by .json() when
        # the endpoint answers with a non-JSON body.
        return {"error": f"Inference API request failed: {exc}"}

def read_text(filename, filetype='txt'):
    """Load a word resource from ``filename`` plus the extension for *filetype*.

    Args:
        filename: Path of the resource without its extension.
        filetype: ``'txt'`` for a file with one entry per line, ``'json'`` for
            a JSON document.

    Returns:
        For ``'txt'``: a list of the unique, non-empty lines (trailing
        whitespace removed) in the order they first appear in the file.
        For ``'json'``: whatever ``json.load`` produces for the document.
        For any other *filetype*: an empty list.
    """
    if filetype == 'txt':
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(filename + '.txt', encoding='utf-8') as file:
            stripped = (line.rstrip() for line in file)
            # dict.fromkeys de-duplicates while keeping file order, so the
            # result is the same on every run (a set's order is not). Blank
            # lines are dropped: an empty string is not a usable word.
            return list(dict.fromkeys(entry for entry in stripped if entry))

    if filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            return json.load(json_file)

    return []


# Resource tables, loaded once at import time from files in the working directory.
contractions = read_text('contractions', 'json')   # iterated as pattern -> replacement pairs in preprocess()
similar_words = read_text('similar_words')         # extra fuzzy-match candidates that are not profanities
addon_words = read_text('addon_words')             # prefixes/suffixes examined in fuzzy_lookup()
profanities = read_text('profanities', 'json')     # presumably base word -> list of variants; verify against the file

# Every known profanity spelling: all dict values flattened, followed by the keys.
lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])

# Candidates scored by fuzzy_lookup(). Sorted so that ties between equally
# scored candidates are broken the same way on every run; a list built
# straight from a set changes order with string hash randomisation.
lookup_words = sorted(set(similar_words).union(lookup_profanity.tolist()))

# English dictionary words minus anything that is also a listed profanity.
# Kept as a frozenset because it is only used for membership tests, which
# would otherwise be a linear scan of the whole dictionary for every token.
eng_words = frozenset(words.words()) - frozenset(lookup_profanity.tolist())

# TODO: check for English words that are also Tagalog profanities

def fuzzy_lookup(tweet):
    """Return the words of *tweet* whose closest fuzzy match is a profanity.

    Each whitespace-separated word is skipped if it is in ``eng_words``.
    Otherwise any ``addon_words`` prefix/suffix is stripped, surrounding
    punctuation and every character other than ``a-z A-Z 0-9 @`` is removed,
    and the result is compared with every entry of ``lookup_words`` using
    ``fuzz.ratio``. A word is reported when the best candidate scoring at
    least 70 is contained in ``lookup_profanity``.

    Args:
        tweet: Text to scan; words are obtained with ``str.split()``.

    Returns:
        A list of the matching words exactly as they appear in *tweet*
        (before any stripping), in order of appearance.
    """
    min_length = 4    # shorter cleaned words are never scored
    min_score = 70    # lowest fuzz.ratio that counts as a candidate match

    matched_profanity = []

    for base_word in tweet.split():

        if base_word in eng_words:
            continue

        word = base_word
        for addon in addon_words:
            # Every string starts and ends with '', and word[:-0] would wipe
            # the whole word, so empty entries must be ignored.
            if not addon:
                continue
            # The slices were previously evaluated and thrown away, so no
            # affix was ever removed. A strip is only applied when the
            # remainder is still long enough to be scored below.
            if word.startswith(addon) and len(word) - len(addon) >= min_length:
                word = word[len(addon):]
            if word.endswith(addon) and len(word) - len(addon) >= min_length:
                word = word[:-len(addon)]

        word = word.strip(punctuation)
        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)

        if len(processed_word) < min_length:
            continue

        # Get fuzzy ratio against every candidate and keep the acceptable ones
        scored = [
            (fuzz.ratio(processed_word, lookup_word), lookup_word)
            for lookup_word in lookup_words
        ]
        candidates = [pair for pair in scored if pair[0] >= min_score]
        if not candidates:
            continue

        # max() returns the first of several equally scored candidates,
        # matching the first-maximum behaviour of np.argmax.
        _, best_word = max(candidates, key=lambda pair: pair[0])
        if best_word in lookup_profanity:
            matched_profanity.append(base_word)

    return matched_profanity


def preprocess(tweet):
    """Normalise a raw tweet before profanity lookup and classification.

    Steps, in order: lowercase; remove emoji; collapse any character repeated
    three or more times in a row to a single occurrence; drop words
    containing ``'http'``; replace words containing a laugh pattern with
    ``'haha'``; and, unless exactly one word remains, expand every entry of
    ``contractions``.

    Args:
        tweet: The raw input text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Defined once instead of being rebuilt for every word
    laugh_texts = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')

    cleaned_words = []
    for word in tweet.split():

        # Remove links. Skipping here also keeps a link that happens to
        # contain a laugh pattern from being turned into 'haha' below.
        if 'http' in word:
            continue

        # Unify laugh texts format to 'haha'
        if any(x in word for x in laugh_texts):
            word = 'haha'

        cleaned_words.append(word)

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(cleaned_words)

    # A single remaining word is returned without contraction expansion
    if len(cleaned_words) == 1:
        return preprocessed_tweet

    # Expand Contractions
    for contraction, expansion in contractions.items():
        # Keys are treated as literal text, so they are escaped before being
        # placed in the pattern; the callable replacement inserts the
        # expansion verbatim instead of parsing it as a regex template.
        preprocessed_tweet = re.sub(
            rf"\b{re.escape(contraction)}\b",
            lambda _match, expansion=expansion: expansion,
            preprocessed_tweet,
        )

    return preprocessed_tweet



def predict(tweet):
    """Classify *tweet* and list the profane words found in it.

    The tweet is preprocessed and scanned with ``fuzzy_lookup``; the model is
    only queried when at least one profanity was matched.

    Args:
        tweet: Raw text entered by the user.

    Returns:
        A ``(label, rows)`` pair for the two output components:
        ``("No Profanity", [[]])`` when nothing matched,
        ``(error_message, [[]])`` when the API reply is a dict, otherwise
        ``(label, [matched_words])`` with the label of the first entry of
        the API reply.
    """
    preprocessed_tweet = preprocess(tweet)
    matched_profanity = fuzzy_lookup(preprocessed_tweet)

    if not matched_profanity:
        return "No Profanity", [[]]

    prediction = query(preprocessed_tweet)

    # A dict reply is treated as an error report rather than a result list.
    if isinstance(prediction, dict):
        print(prediction)
        # Fall back to the whole payload if there is no 'error' entry, rather
        # than failing with a KeyError.
        error_message = prediction.get('error', f"Unexpected API response: {prediction}")
        return error_message, [[]]

    prediction = prediction[0][0]["label"]

    print("\nTWEET:", tweet)
    print("DETECTED PROFANITY:", matched_profanity)
    print("LABEL:", prediction, "\n")

    return prediction, [matched_profanity]


# --- Gradio user interface -------------------------------------------------
# Components are built first, in the same order as before, then wired to
# predict(): one text input, the predicted label and the matched words.
_input_box = gr.components.Textbox(
    lines=5,
    placeholder='Enter your input here',
    label='INPUT',
)
_prediction_box = gr.components.Text(label="PREDICTION")
_profanity_list = gr.List(label="PROFANITIES")

# Sample inputs offered in the UI.
_example_tweets = [
    'Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
    'Napakainit ngayong araw pakshet namaaan!!',
    'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
    'Bobo ka ba? napakadali lang nyan eh... 🤡',
    'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂',
]

demo = gr.Interface(
    fn=predict,
    inputs=[_input_box],
    outputs=[_prediction_box, _profanity_list],
    examples=_example_tweets,
    allow_flagging="never",
    title="Tagalog Profanity Classifier",
)

demo.launch(debug=True)

# NOTE(review): this call only runs once launch() has returned, and it
# discards the result; it looks like a leftover manual check. Confirm before
# removing.
predict("Tangina mo naman gag0 ka ba")