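# Gradio demo for Tagalog profanity detection and censorship.
# The classifier is queried through the Hugging Face Inference API (model: Dabid/test2);
# fuzzy matching is used to normalize obfuscated profanities before prediction.
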
import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
headers = {"Authorization": "Bearer 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"}
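
# Helper to load a word list (.txt, one entry per line) or a dictionary (.json) from disk.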
def read_text(filename, filetype='txt'):
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words
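
# Lookup resources, assumed to sit next to this script:
# contractions.json, lookup_words.txt, obj_pronouns.txt, profanities.json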
contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
obj_pronouns = read_text('obj_pronouns')
profanities = read_text('profanities', 'json')
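
# Fuzzy-match every word of the tweet against the lookup vocabulary (thefuzz ratio >= 70)
# and rewrite obfuscated or misspelled profanities to their canonical spelling.
# Returns the rewritten tweet and a dict of {original word: matched profanity}.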
def fuzzy_lookup(tweet):
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    matches = dict()

    # Loop over each word in the tweet
    for word in tweet.split():
        # Keep only letters, digits, and '@', then lowercase
        word = re.sub("[^a-zA-Z0-9@]", "", word).lower()
        scores = []
        matched_words = []
        # Only consider words with at least 4 characters
        if len(word) >= 4:
            # Get fuzzy ratio against every lookup word
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 70:
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Re-attach an object pronoun fused to the matched profanity (e.g. 'tanginamo' -> 'tangina mo')
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronouns:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Replace each matched word in the tweet with its fuzzy lookup result
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Normalize known spelling variations to their base profanity
    for profanity, prof_variations in profanities.items():
        if len(prof_variations) > 0:
            for prof_variant in prof_variations:
                tweet = tweet.replace(prof_variant, profanity)

    return tweet, matches
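
# Clean the (already fuzzy-normalized) tweet before sending it to the model:
# lowercase, strip emojis/links/mentions/hashtags, collapse elongated words,
# unify laugh variants, and expand contractions.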
def preprocess(tweet):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase
    tweet = tweet.lower()

    # Remove emojis
    tweet = emoji.replace_emoji(tweet, replace='')

    # Collapse elongated words, e.g. 'grabeee' -> 'grabe' (doubled letters are left untouched)
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Split sentence into a list of words
    row_split = tweet.split()

    for index, word in enumerate(row_split):
        # Remove words with symbols (e.g. @username, #hashtag)
        if any(x in word for x in symbols):
            row_split[index] = ''

        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh text variants to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine the list of words back into a sentence
    combined_text = ' '.join(filter(None, row_split))

    # If only a single word remains, return it without further cleanup
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only letters and spaces
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions
    for i in contractions.items():
        combined_text = re.sub(rf"\b{i[0]}\b", i[1], combined_text)

    return combined_text
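
# Send the processed text to the Hugging Face Inference API and return the parsed JSON.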
def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
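
# End-to-end pipeline for the Gradio interface: normalize, preprocess, classify,
# and censor detected profanities when the predicted label is 'Abusive'.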
def predict(tweet):
    fuzzy_text, matches = fuzzy_lookup(tweet)
    processed_text = preprocess(fuzzy_text)
    output = query(processed_text)

    if 'error' in output:
        return output['error'], 'Error occurred. Try again later.', {}
    elif len(matches) == 0:
        return 'No Profanity Found.', '', {}
    else:
        # Convert the API output [[{'label': ..., 'score': ...}, ...]] into a {label: score} dict
        output = [tuple(i.values()) for i in output[0]]
        output = dict((x, y) for x, y in output)
        predicted_label = list(output.keys())[0]

        if predicted_label == 'Abusive':
            # Censor each detected profanity in the original tweet
            for base_word, _ in matches.items():
                mask = '*' * len(base_word)
                compiled = re.compile(re.escape(base_word), re.IGNORECASE)
                tweet = compiled.sub(mask, tweet)
                # tweet = tweet.replace(base_word, re.sub("[a-zA-Z0-9@]", "*", base_word))
            return output, tweet, json.dumps(matches)
        else:
            return output, tweet, json.dumps(matches)
# output, tweet, matches = predict('ul0L Sama ng ugali mo pre Tangina uL0l!!!')
# print(output, '\n', tweet, '\n', matches)
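
# Flagged examples are written to the 'tagalog-profanity-feedbacks' dataset on the Hub.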
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')
demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON(label='DETECTED PROFANITIES')],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😀 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)
demo.launch()