Spaces:

mginoben
/

tagalog-profanity-classification

Sleeping

File size: 5,556 Bytes

34fbcfb
 
 
 
3172d47
 
 
bce56c0
3172d47
34fbcfb
 
 
 
3172d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6912dca
3172d47
 
 
 
 
bce56c0
 
3172d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6912dca
3172d47
 
 
 
 
 
 
 
 
 
6912dca
 
 
3172d47
6912dca
bce56c0
3172d47
 
bce56c0
34fbcfb
 
 
 
bce56c0
34fbcfb
 
bce56c0
34fbcfb
 
bce56c0
34fbcfb
 
bce56c0
34fbcfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bce56c0
 
6912dca
 
1371b6f
48392ea
3172d47
34fbcfb
48392ea
 
 
 
 
bce56c0
6912dca
bce56c0
 
 
48392ea
bce56c0
 
34fbcfb
 
 
 
 
 
 
 
 
3172d47
bce56c0
34fbcfb
 
 
 
 
 
 
 
 
 
 
 
b862f6c

import json
import os
import re

import emoji
import gradio as gr
import numpy as np
import requests
from thefuzz import process, fuzz


# Hosted inference endpoint of the classification model.
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"

# SECURITY: credentials should not live in source control. The token is read
# from the HF_API_TOKEN environment variable when it is set (and non-empty);
# the literal below is kept only so existing deployments keep working and
# should be rotated and removed once the variable is configured.
headers = {
    "Authorization": "Bearer "
    + (os.environ.get("HF_API_TOKEN") or "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")
}

def read_text(filename, filetype='txt'):
    """Load a word resource from disk.

    Args:
        filename: Path of the resource WITHOUT its extension; the extension
            is appended according to ``filetype``.
        filetype: ``'txt'`` for a file with one entry per line, or ``'json'``
            for a JSON document.

    Returns:
        For ``'txt'``: a list of the distinct lines with trailing whitespace
        stripped, in first-seen order. For ``'json'``: whatever ``json.load``
        produces for the document. For any other ``filetype``: an empty list.
    """
    if filetype == 'txt':
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys de-duplicates while keeping file order, unlike a
            # set whose iteration order changes between interpreter runs.
            return list(dict.fromkeys(line.rstrip() for line in file))

    if filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            return json.load(json_file)

    return []


# Resources loaded once at import time and shared by the functions below.
# The two JSON documents are iterated with .items() elsewhere in this file;
# the two text files are loaded as de-duplicated lists of lines.
contractions = read_text('contractions', filetype='json')
profanities = read_text('profanities', filetype='json')
lookup_words = read_text('lookup_words', filetype='txt')
obj_pronouns = read_text('obj_pronouns', filetype='txt')


def fuzzy_lookup(tweet):
    """Normalise obfuscated or misspelled profanities in *tweet*.

    Each whitespace-separated word is stripped down to ASCII letters, digits
    and '@', then compared against every entry of ``lookup_words`` with
    ``fuzz.ratio``. When the best-scoring entry (score >= 65) is a known
    profanity, the word is recorded and later replaced in the text.

    Args:
        tweet: Raw input text.

    Returns:
        A ``(text, matches)`` tuple: ``text`` is the tweet after all
        replacements, ``matches`` maps each cleaned input word to the
        profanity (optionally followed by an object pronoun) it was
        replaced with.
    """
    # Every known profanity spelling: the canonical forms (keys) plus all of
    # their variants. A set gives O(1) membership and also works when the
    # mapping is empty.
    # NOTE(review): assumes each value of `profanities` is a list of strings.
    known_profanities = set(profanities)
    for variants in profanities.values():
        known_profanities.update(variants)

    matches = {}

    for raw_word in tweet.split():
        # Keep only ASCII letters, digits and '@'
        word = re.sub("[^a-zA-Z0-9@]", "", raw_word)

        # Words shorter than 4 characters are never fuzzy-matched
        if len(word) < 4:
            continue

        best_score = 0
        best_match = None
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            # Strict '>' keeps the earliest candidate when scores tie
            if score >= 65 and score > best_score:
                best_score = score
                best_match = lookup_word

        # Only the single best candidate counts, and only if it is a profanity
        if best_match is not None and best_match in known_profanities:
            matches[word] = best_match

    # A word such as "<profanity><pronoun>" is split on the last two letters
    # of the matched profanity; if what follows is an object pronoun, keep it
    # as a separate word after the profanity.
    for word, matched_profanity in matches.items():
        pieces = word.split(matched_profanity[-2:])
        if len(pieces) > 1 and pieces[-1] in obj_pronouns:
            matches[word] = matched_profanity + ' ' + pieces[-1]

    # Replace every matched word by its fuzzy lookup result. This is a plain
    # substring replacement, so occurrences inside longer words change too.
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Collapse every known variant spelling to its canonical profanity
    for profanity, prof_variations in profanities.items():
        for prof_variant in prof_variations:
            tweet = tweet.replace(prof_variant, profanity)

    print('Fuzzy Returns:', tweet)
    return tweet, matches


def preprocess(tweet):
    """Clean *tweet* into the plain lowercase text sent to the classifier.

    Steps, in order: lowercase, strip emojis, collapse runs of three or more
    identical characters to one, drop words containing '@', '#' or 'http',
    unify laugh words to 'haha', then (only when more than one word or no
    word remains) remove everything except letters and spaces and expand
    the configured contractions.

    Args:
        tweet: Text to clean.

    Returns:
        The cleaned text. A result consisting of exactly one word is
        returned as-is, before character filtering and contraction expansion.
    """
    laugh_markers = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')
    drop_markers = ('@', '#')

    # Lowercase, then strip emojis
    text = emoji.replace_emoji(tweet.lower(), replace='')

    # Elongated words: 'grabeee' -> 'grabe' (double letters are left alone)
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    kept_words = []
    for token in text.split():
        # A laugh marker wins over every other rule for the same token
        if any(marker in token for marker in laugh_markers):
            kept_words.append('haha')
        # Mentions, hashtags and links are dropped entirely
        elif 'http' in token or any(marker in token for marker in drop_markers):
            continue
        else:
            kept_words.append(token)

    combined_text = ' '.join(kept_words)

    # A single remaining word is returned untouched by the later steps
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep letters and spaces only
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions (each key is used as a regex between word boundaries)
    for short_form, expansion in contractions.items():
        combined_text = re.sub(rf"\b{short_form}\b", expansion, combined_text)

    return combined_text


def query(payload, timeout=60):
    """POST *payload* as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.post``. ``None`` waits indefinitely,
            which was the previous behaviour.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Without a timeout a stalled connection would block the request forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()


def predict(tweet):
    """Classify *tweet* and censor detected profanities when it is abusive.

    Args:
        tweet: Raw user input.

    Returns:
        A 3-tuple for the three output components. On an API error:
        ``(error message, fixed notice text, {"error": "error"})``.
        Otherwise: ``(scores, text, matches_json)`` where ``scores`` is a
        dict built from the value pairs of each entry in the first element
        of the API reply, ``text`` is the input with every matched word
        replaced by asterisks if the first label is 'Abusive' (unchanged
        otherwise), and ``matches_json`` is the fuzzy matches serialised
        as a JSON string.
    """
    normalized_text, matches = fuzzy_lookup(tweet)
    response = query(preprocess(normalized_text))

    # Guard clause: the API reports failures through an 'error' entry
    if 'error' in response:
        return response['error'], 'Error occured. Try again later.', {"error": "error"}

    # Each entry's values form one (key, value) pair of the scores dict
    pairs = [tuple(entry.values()) for entry in response[0]]
    scores = dict(pairs)
    top_label = list(scores)[0]

    if top_label == 'Abusive':
        # Mask every matched word in the ORIGINAL input, one '*' per character
        for flagged_word in matches:
            masked = re.sub("[a-zA-Z0-9@]", "*", flagged_word)
            tweet = tweet.replace(flagged_word, masked)

    return scores, tweet, json.dumps(matches)

# Flagged samples are saved to a Hugging Face dataset.
# SECURITY: the write token should come from the HF_WRITE_TOKEN environment
# variable; the literal fallback is kept only so existing deployments keep
# working and should be rotated and removed once the variable is configured.
hf_writer = gr.HuggingFaceDatasetSaver(
    os.environ.get('HF_WRITE_TOKEN') or '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',
    'tagalog-profanity-feedbacks',
)


# One text input; three outputs matching the 3-tuple returned by predict().
demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON(label='DETECTED PROFANITIES')],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],

    # Manual flagging: users label a result and it is stored via hf_writer
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

demo.launch()