import gradio as gr
import requests
import emoji
import re
import json
import os
import numpy as np
import nltk
from thefuzz import fuzz

# Download the English word list once at startup
nltk.download('words')
from nltk.corpus import words

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# Read the API token from the environment rather than hardcoding the secret;
# HF_API_TOKEN is a placeholder variable name
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_TOKEN', '')}"}


def query(text):
    """Send text to the hosted model and return the decoded JSON response."""
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
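
# Typical response shapes from the hosted text-classification endpoint
# (illustrative; actual label names depend on the model):
#   success: [[{"label": "Abusive", "score": 0.98}, {"label": "Non-Abusive", "score": 0.02}]]
#   error:   {"error": "...", "estimated_time": 20.0}  e.g. while the model loads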

def read_text(filename, filetype='txt'):
    """Load a deduplicated word list from a .txt file or a dict from a .json file."""
    data = []

    if filetype == 'txt':
        with open(filename + '.txt') as file:
            data = [line.rstrip() for line in file]
        data = list(set(data))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            data = json.load(json_file)

    return data
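
# Expected on-disk formats, inferred from how the files are used below:
#   similar_words.txt / addon_words.txt - one entry per line
#   contractions.json - {"contraction": "expansion", ...}
#   profanities.json  - {"base_profanity": ["variation", ...], ...}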

contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')

# Flatten the profanity dictionary into one lookup list: all variations plus base forms
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))

# English vocabulary minus the profanity list, used to skip ordinary words
eng_words = list(set(words.words()) - set(lookup_profanity))

# Strips leading/trailing punctuation while keeping # and @ markers
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
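
# Illustrative only: with profanities_dict = {"bobo": ["b0b0"], "gago": ["gag0"]}
# (hypothetical contents), lookup_profanity would be ["b0b0", "gag0", "bobo", "gago"].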

def fuzzy_lookup(tweet):
    """Map each word in the tweet to the canonical profanity it fuzzily matches."""

    matched_profanity = dict()

    for word in tweet.split():

        # Normalize: strip surrounding punctuation (keeping # and @) and lowercase
        word = punctuations.sub('', word).lower()

        # Skip ordinary English words (checked after normalization so
        # punctuation and case do not hide them)
        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Keep the normalized form as the key for the final mapping
        base_word = word

        # Collapse runs of three or more identical characters (e.g. "boboooo" -> "bobo")
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Drop a leading hashtag or mention marker
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip known affixes so the stem can be matched
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        # Too short to match reliably
        if len(word) < 4:
            continue

        # Fuzzy-match the stem against the combined lookup vocabulary
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates scored the same: prefer the first known profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Otherwise take the highest-scoring candidate
            matched_word = matched_words[np.argmax(scores)]

        if matched_word not in lookup_profanity:
            continue

        # Map the matched variant back to its canonical (base) profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Re-attach a stripped suffix so the replacement reads naturally
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
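
# Illustrative only (results depend on the actual profanities.json contents):
# assuming "bobo" is a base profanity, fuzzy_lookup("boboooo ka!!") would collapse
# the repeated letters, match "bobo" exactly, and return {"boboooo": "bobo"}.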

def preprocess(tweet, profanities):

    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace each detected variant with its canonical profanity
    for base_word, base_profanity in profanities.items():
        tweet = tweet.replace(base_word, base_profanity)

    # Collapse runs of three or more identical characters
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    for index, word in enumerate(row_split):

        # Remove URLs
        if 'http' in word:
            row_split[index] = ''

        # Normalize laughter variants to a single token
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    preprocessed_tweet = ' '.join(filter(None, row_split))

    # Single-word tweets need no contraction expansion
    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions as whole words
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet
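
# Illustrative only (assumes the hypothetical mapping above):
# preprocess("Boboooo ka HAHAHAHA http://t.co/x", {"boboooo": "bobo"})
# would yield "bobo ka haha", subject to any contraction expansions.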

def predict(tweet):

    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:

        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The Inference API returns a dict instead of a list on error,
        # e.g. while the model is still loading
        if isinstance(prediction, dict):
            print(prediction)
            return prediction['error'], {}

        # Top label of the first (and only) input
        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", {}

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!! @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh...',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier"
)

demo.launch(debug=True)