|
import gradio as gr |
|
import requests |
|
import emoji |
|
import re |
|
import json |
|
from thefuzz import process, fuzz |
|
import numpy as np |
|
|
|
|
|
# Hugging Face Inference API endpoint for the hosted profanity classifier.
API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
# NOTE(review): hard-coded API token committed to source — rotate this key and
# load it from an environment variable or secrets store instead.
headers = {"Authorization": "Bearer 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"}

# Seed list of Tagalog profanities.
# NOTE(review): this list is shadowed below by
# `profanities = read_text('profanities', 'json')`, so it is effectively dead
# code — confirm and remove one of the two definitions.
profanities = ['bobo', 'bwiset','gago', 'kupal',
               'pakshet', 'pakyu', 'pucha',
               'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
               'tarantado', 'ulol']
|
|
|
def read_text(filename, filetype='txt'):
    """Load a word-list resource stored next to this script.

    Parameters
    ----------
    filename : str
        Path without extension; the extension is derived from ``filetype``.
    filetype : str
        ``'txt'``  -> de-duplicated list of right-stripped lines.
        ``'json'`` -> the parsed JSON object (here: a dict of word variants).

    Returns
    -------
    list | dict
        ``[]`` when ``filetype`` is neither ``'txt'`` nor ``'json'``.
    """
    words = []

    if filetype == 'txt':
        # Explicit encoding so results do not depend on the host locale.
        with open(filename + '.txt', encoding='utf-8') as file:
            # The set removes duplicate entries; note this loses file order.
            words = list({line.rstrip() for line in file})
    elif filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            words = json.load(json_file)

    return words
|
|
|
|
|
# Resource tables loaded once at import time (files must sit beside this script).
contractions = read_text('contractions', 'json')  # {contraction: expansion}
lookup_words = read_text('lookup_words')  # candidate words for fuzzy matching
obj_pronouns = read_text('obj_pronouns')  # NOTE(review): loaded but never used below — confirm
# {canonical profanity: [spelling variants]} — overrides the list defined above.
profanities = read_text('profanities', 'json')
|
|
|
|
|
def fuzzyLookup(tweet):
    """Normalise obfuscated profanities in *tweet* to their canonical spelling.

    Fuzzy-matches each token against the module-level ``lookup_words`` list
    (thefuzz ratio >= 65), keeps matches that are known profanities, splits
    off fused object pronouns, then collapses every known spelling variant to
    the canonical key of the module-level ``profanities`` dict.

    Returns a ``(normalised_tweet, json_string_of_matches)`` tuple.
    """
    # All known profanity variants plus their canonical forms, flattened.
    # NOTE(review): recomputed on every call — could be hoisted to module level.
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    # Tagalog object pronouns that may be fused onto the end of a profanity.
    # NOTE(review): an ``obj_pronouns`` file is loaded at module level but this
    # hard-coded list is used instead — confirm which one is intended.
    obj_pronoun = ['ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng']
    matches = dict()  # {original token: canonical/variant profanity it matched}

    for word in tweet.split():
        scores = []
        matched_words = []

        # Very short tokens give unreliable fuzz ratios, so skip them.
        if len(word) >= 4:

            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:  # similarity threshold (percent)
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                # Keep only the best-scoring candidate, and only if that
                # candidate is actually a known profanity (variant or key).
                max_score_index = np.argmax(scores)
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Split fused object pronouns off a matched token, turning it into
    # "<profanity> <pronoun>". Only dict *values* are rewritten here, so
    # mutating while iterating .items() is safe.
    for word, matched_profanity in matches.items():
        # Split the original token on the last two characters of the match —
        # presumably to expose any trailing pronoun; TODO confirm heuristic.
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronoun:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Substitute the original tokens in the tweet with their normalised forms.
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Finally collapse every known spelling variant to its canonical key.
    tweet_split = tweet.split()
    for profanity, prof_varations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_varations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)
|
|
|
|
|
def preprocess(text):
    """Clean raw tweet text before classification.

    Lower-cases, strips emoji, collapses character runs, drops
    mentions/hashtags/URLs/digit-bearing tokens, normalises laughter to
    'haha', and (for multi-word results) strips non-letters and expands
    contractions from the module-level ``contractions`` table.
    """
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Case-fold, then strip all emoji.
    cleaned = emoji.replace_emoji(text.lower(), replace='')

    # Collapse any character repeated three or more times to one occurrence.
    cleaned = re.sub(r'(.)\1{2,}', r'\1', cleaned)

    tokens = cleaned.split()
    for idx, token in enumerate(tokens):
        # Each rule tests the ORIGINAL token, so a later matching rule
        # overrides an earlier one (e.g. laughter containing a digit -> '').
        if any(sym in token for sym in symbols):
            tokens[idx] = ''  # mentions and hashtags
        if 'http' in token:
            tokens[idx] = ''  # URLs
        if any(laugh in token for laugh in laugh_texts):
            tokens[idx] = 'haha'  # normalise laughter variants
        if any(ch.isdigit() for ch in token):
            tokens[idx] = ''  # tokens containing digits

    combined_text = ' '.join(filter(None, tokens))

    # Single-word results skip punctuation stripping and contraction expansion.
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only ASCII letters and spaces.
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand known contractions (whole-word matches only).
    for short_form, long_form in contractions.items():
        combined_text = re.sub(rf"\b{short_form}\b", long_form, combined_text)

    return combined_text
|
|
|
|
|
def query(payload):
    """POST *payload* to the hosted model and return the decoded JSON response.

    Parameters
    ----------
    payload : str
        The preprocessed text to classify; sent as the request's JSON body.

    Returns
    -------
    The API's JSON payload: a nested list of label/score dicts on success,
    or an ``{'error': ...}`` dict while the model is loading.
    """
    # A timeout keeps the Gradio UI from hanging forever if the inference
    # API stalls; on expiry requests raises requests.exceptions.Timeout.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    return response.json()
|
|
|
|
|
def predict(text):
    """Run the full pipeline on *text* and build the three Gradio outputs.

    Returns a tuple of:
      - label->score dict for the Label widget (or the raw error string),
      - the output text, with profanities masked when the top label is
        'Abusive',
      - the fuzzy-match substitutions (JSON string, or an error dict).
    """
    text = preprocess(text)
    text, matches = fuzzyLookup(text)
    output = query(text)

    if 'error' in output:
        # Model still loading or API failure — surface the message to the UI.
        # (Fixed typo: "occured" -> "occurred".)
        return output['error'], 'Error occurred. Try again later.', {"error": "error"}
    else:
        # API returns [[{'label': ..., 'score': ...}, ...]]; flatten to a dict.
        output = dict(tuple(item.values()) for item in output[0])

        # Assumes the API lists labels by descending score, so the first key
        # is the prediction — TODO confirm against the model endpoint.
        predicted_label = list(output.keys())[0]

        if predicted_label == 'Abusive':
            # Mask every canonical profanity with '*' characters
            # (spaces preserved), matching case-insensitively.
            output_text = text
            for profanity in profanities:
                compiled = re.compile(re.escape(profanity), re.IGNORECASE)
                mask = ''.join('*' if ch != ' ' else ' ' for ch in profanity)
                output_text = compiled.sub(mask, output_text)
            return output, output_text, matches
        else:
            return output, text, matches
|
|
|
|
|
|
|
|
|
# Saves manually-flagged examples to a Hugging Face dataset.
# NOTE(review): hard-coded write token committed to source — rotate this key
# and load it from an environment variable or secrets store instead.
hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')

# Gradio UI: one text input; outputs are the label scores, the (possibly
# masked) text, and the fuzzy-match substitutions as JSON.
demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON()],

    # NOTE(review): the emoji in these examples appear mojibake-encoded —
    # verify the file is saved/read as UTF-8.
    examples=['Tangina mo naman sobrang yabang mo gago!!๐ ๐ค @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... ๐คก',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA๐๐'],

    # Manual flagging with two options, persisted via hf_writer above.
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

demo.launch()
|
|