Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

File size: 6,066 Bytes

import json
import os
import re

import emoji
import gradio as gr
import numpy as np
import requests
from thefuzz import process, fuzz


# Hosted Inference API endpoint of the classification model.
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"

# The access token is taken from the HF_API_TOKEN environment variable so
# that no credential lives in the source. Without it the Authorization
# header is simply omitted and requests are sent unauthenticated.
_hf_api_token = os.environ.get("HF_API_TOKEN")
headers = {"Authorization": f"Bearer {_hf_api_token}"} if _hf_api_token else {}

# NOTE: this name is rebound further down in the module to the mapping
# loaded from 'profanities.json', so this literal list is never consulted
# at prediction time; it only documents the base profanities.
profanities = ['bobo', 'bwiset','gago', 'kupal',
               'pakshet', 'pakyu', 'pucha',
               'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
               'tarantado', 'ulol']

def read_text(filename, filetype='txt'):
    """Load a word resource stored next to the script.

    Args:
        filename: Path of the resource without its extension.
        filetype: ``'txt'`` reads one entry per line (trailing whitespace
            stripped, duplicates removed); ``'json'`` returns the decoded
            JSON document unchanged.

    Returns:
        For ``'txt'`` a list of unique lines in first-seen order, for
        ``'json'`` whatever the file decodes to, and an empty list for any
        other ``filetype``.
    """
    words = []

    if filetype == 'txt':
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys de-duplicates while keeping file order; a set
            # would yield a different order on every interpreter run, which
            # changes which entry wins when fuzzy scores tie.
            words = list(dict.fromkeys(line.rstrip() for line in file))
    elif filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            words = json.load(json_file)

    return words


# Resources loaded once at import time from files in the working directory.
# `contractions` and `profanities` are JSON mappings (both are iterated with
# .items() below); `lookup_words` and `obj_pronouns` are line-based lists.
# `profanities` replaces the literal list defined near the top of the module.
contractions = read_text('contractions', filetype='json')
lookup_words = read_text('lookup_words', filetype='txt')
obj_pronouns = read_text('obj_pronouns', filetype='txt')
profanities = read_text('profanities', filetype='json')


def fuzzyLookup(tweet):
    """Normalize misspelled or suffixed profanities found in ``tweet``.

    Every word of at least four characters is scored against each entry of
    the module-level ``lookup_words`` with ``fuzz.ratio``. When the
    best-scoring entry (score of 65 or more, first entry wins on ties) is a
    key or a variation listed in ``profanities``, the word is replaced by
    that entry. If the word additionally ends in an object pronoun, the
    pronoun is split off as a separate word. Finally each known variation is
    collapsed to its key in ``profanities``.

    Args:
        tweet: Whitespace-separated text; ``predict`` passes the output of
            ``preprocess``.

    Returns:
        A ``(normalized_tweet, matches_json)`` tuple. ``matches_json`` is a
        JSON-encoded dict mapping each replaced word to its replacement.
    """
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']

    # All profanity keys plus all of their variations. A set gives constant
    # time membership tests and also works when the mapping is empty.
    lookup_profanity = set(profanities)
    for variations in profanities.values():
        lookup_profanity.update([variations] if isinstance(variations, str) else variations)

    matches = {}

    # Find, for each sufficiently long word, its closest lookup word.
    for word in tweet.split():
        # Words shorter than 4 characters are not fuzzy-matched.
        if len(word) < 4 or word in matches:
            continue

        best_score, best_match = 0, None
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            # Strict '>' keeps the first entry among equal scores.
            if score >= 65 and score > best_score:
                best_score, best_match = score, lookup_word

        # Only keep the match when the closest lookup word is a profanity.
        if best_match in lookup_profanity:
            matches[word] = best_match

    # Detach a trailing object pronoun: splitting the word on the last two
    # letters of the profanity leaves the suffix as the final piece.
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        if len(word_split) > 1 and word_split[-1] in obj_pronoun:
            matches[word] = matched_profanity + ' ' + word_split[-1]

    # Replace whole words only. A plain str.replace on the tweet would also
    # rewrite other words that merely contain a matched word, and could
    # re-replace text produced by an earlier replacement.
    replaced = ' '.join(matches.get(word, word) for word in tweet.split())

    # Collapse every known variation to its key in `profanities`.
    tweet_split = replaced.split()
    for profanity, prof_varations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_varations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)


def preprocess(text):
    """Clean raw input text before profanity lookup and classification.

    Steps, in order: lowercase; strip emojis; collapse any character repeated
    three or more times in a row to a single occurrence ('grabeee' ->
    'grabe'); drop or rewrite individual words (see below); and, unless only
    one word is left, remove everything except ASCII letters and spaces and
    expand the entries of the module-level ``contractions`` mapping.

    Per-word rules, highest priority first:
      * a word containing a digit is dropped (e.g. '4ever');
      * a word containing a laugh pattern becomes 'haha';
      * a word containing '@', '#' or 'http' is dropped.

    When exactly one word remains it is returned as is, without the letter
    filtering and contraction expansion.
    """
    laugh_markers = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')
    symbol_markers = ('@', '#')

    # Lowercase, strip emojis, then squeeze runs of 3+ identical characters.
    cleaned = emoji.replace_emoji(text.lower(), replace='')
    cleaned = re.sub(r'(.)\1{2,}', r'\1', cleaned)

    kept_words = []
    for token in cleaned.split():
        # Digits win over every other rule.
        if any(ch.isdigit() for ch in token):
            continue
        # Laugh text wins over the mention/hashtag/link removal.
        if any(marker in token for marker in laugh_markers):
            kept_words.append('haha')
            continue
        # Mentions, hashtags and links are removed entirely.
        if 'http' in token or any(mark in token for mark in symbol_markers):
            continue
        kept_words.append(token)

    combined_text = ' '.join(kept_words)

    # A single remaining word is handed back untouched.
    if len(kept_words) == 1:
        return combined_text

    # Keep ASCII letters and spaces only.
    letters_only = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions on word boundaries.
    for short_form, long_form in contractions.items():
        letters_only = re.sub(rf"\b{short_form}\b", long_form, letters_only)

    return letters_only


def query(payload, timeout=30):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON response. If the request fails, times out, or the
        body is not valid JSON, a dict of the form ``{"error": message}`` is
        returned instead, which ``predict`` already treats as a failure.
    """
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older requests versions.
        return {"error": str(exc)}


def predict(text):
    """Classify ``text`` and mask its profanities when it is abusive.

    The text is cleaned with ``preprocess``, normalized with ``fuzzyLookup``
    and sent to the model through ``query``.

    Args:
        text: Raw user input.

    Returns:
        A 3-tuple for the interface outputs:
          * a dict mapping each label to its score (or the API error
            message when the request failed),
          * the processed text, with every profanity key replaced by
            asterisks when the top label is 'Abusive',
          * the JSON string of fuzzy matches from ``fuzzyLookup`` (or
            ``{"error": "error"}`` when the request failed).
    """
    text = preprocess(text)
    text, matches = fuzzyLookup(text)
    output = query(text)

    if 'error' in output:
        return output['error'], 'Error occured. Try again later.', {"error": "error"}

    # Read label and score by key instead of relying on dict value order.
    output = {item['label']: item['score'] for item in output[0]}

    # Pick the highest-scoring label rather than assuming the API sorted them.
    predicted_label = max(output, key=output.get)

    if predicted_label != 'Abusive':
        return output, text, matches

    # Mask longer profanities first so a short one that is a prefix of a
    # longer one (e.g. 'puta' in 'putangina') cannot leave the longer word
    # only partially masked.
    output_text = text
    for profanity in sorted(profanities, key=len, reverse=True):
        mask = ''.join(' ' if ch == ' ' else '*' for ch in profanity)
        output_text = re.sub(re.escape(profanity), mask, output_text, flags=re.IGNORECASE)

    # TODO gag0 not appearing (likely because preprocess drops any word
    # that contains a digit)
    return output, output_text, matches


# Token used to save flagged samples to the feedback dataset. It is read from
# the HF_DATASET_TOKEN environment variable so that no credential lives in
# the source. Without it, flagging is switched off instead of failing.
_hf_dataset_token = os.environ.get("HF_DATASET_TOKEN")

if _hf_dataset_token:
    hf_writer = gr.HuggingFaceDatasetSaver(_hf_dataset_token, 'tagalog-profanity-feedbacks')
    _flagging_kwargs = {
        "allow_flagging": "manual",
        "flagging_callback": hf_writer,
        "flagging_options": ['Good bot', 'Bad bot'],
    }
else:
    hf_writer = None
    _flagging_kwargs = {"allow_flagging": "never"}


# Web UI: one text input; outputs are the label scores, the (possibly masked)
# text and the fuzzy-match details returned by `predict`.
demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON()],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    **_flagging_kwargs
)

demo.launch()