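"""Gradio demo: Tagalog profanity detection with hashtag support.

Fuzzy-matches tweet tokens (including #hashtags and @mentions) against a
profanity dictionary, normalizes the text, and classifies it with the
Dabid/abusive-tagalog-profanity-detection model via the Hugging Face
Inference API.
"""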
import gradio as gr
import requests
import emoji
import re
import json
import numpy as np
from thefuzz import fuzz
from english_words import get_english_words_set
# Hugging Face Inference API endpoint and access token for the fine-tuned model
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
headers = {"Authorization": "Bearer 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"}
def query(text):
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
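# For text-classification models the Inference API typically returns a nested
# list like [[{"label": "...", "score": 0.99}, ...]]; while the model is still
# loading it instead returns a dict such as {"error": "...", "estimated_time": ...}.
# The label strings are model-specific; predict() below reads only [0][0]["label"].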
def read_text(filename, filetype='txt'):
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
        # De-duplicate (order is not preserved)
        words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words
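# Word lists and dictionaries shipped alongside this script. From how they are
# used below, profanities.json is assumed to map each base profanity to a list
# of variant spellings, while the .txt files hold one word per line.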
contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')
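# Flatten every variant spelling plus the base forms into one profanity list,
# union it with the hand-curated similar words to form the fuzzy-match
# vocabulary, and exclude known profanities from the English word set so
# ordinary English tokens can be skipped during lookup.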
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
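# Strips leading/trailing non-word characters while keeping '#' and '@',
# e.g. 'gago!!' -> 'gago' and '#PLDC' -> '#PLDC'.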
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
def fuzzy_lookup(tweet):
    matched_profanity = dict()

    for word in tweet.split():
        # Skip common English words to cut down on false fuzzy matches
        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Remove leading/trailing punctuation except # and @
        word = punctuations.sub('', word).lower()

        # Save base word
        base_word = word

        # Shorten elongated word (e.g. 'gagooo' -> 'gago')
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on affixes (e.g. mo, ka, pinaka) from either end
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        if len(word) < 4:
            continue

        # Collect every lookup word whose fuzzy ratio clears the threshold
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates tied: prefer the first one that is a profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Otherwise take the candidate with the highest score
            matched_word = matched_words[np.argmax(scores)]

        if matched_word not in lookup_profanity:
            continue

        # Map the match back to its canonical (base) profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Re-attach a stripped pronoun so it survives preprocessing
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
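# Illustrative only, since the result depends on the bundled word lists:
# fuzzy_lookup("Tangina naman gagooo!!") might return
# {"tangina": "tangina", "gagooo": "gago"}, i.e. cleaned original token ->
# canonical profanity, with any stripped add-on pronoun re-attached.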
def preprocess(tweet, profanities):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Swap each detected token for its canonical profanity
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Collapse elongated words (e.g. 'namaaan' -> 'naman')
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()
    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''
        # Unify laugh text format to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine the list of words back into a sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet
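# Sketch on a made-up input, where {"bobooo": "bobo"} stands in for a
# hypothetical fuzzy_lookup result and contraction expansion depends on
# contractions.json:
#   preprocess("Sobrang BOBOOO mo HAHAHA http://t.co/x 😂", {"bobooo": "bobo"})
#   -> "sobrang bobo mo haha", which is the form actually sent to the model.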
def predict(tweet):
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:
        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The API returns a dict instead of a list on failure,
        # e.g. while the model is still loading
        if isinstance(prediction, dict):
            print(prediction)
            return prediction['error'], {}

        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", {}
demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier"
)

demo.launch(debug=True)
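# debug=True keeps the process attached and surfaces errors in the console;
# passing share=True here as well would expose a temporary public Gradio link
# if the app needs to be reached from outside the Space.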