import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import fuzz
import numpy as np

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"
headers = {"Authorization": "Bearer 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"}


def read_text(filename, filetype='txt'):
    # Load a wordlist (one entry per line, deduplicated) or a JSON mapping
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words


contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
# Object pronouns, e.g. 'ko', 'mo', 'nya', 'natin', 'namin', 'ninyo', 'nila', 'ka', 'nyo', 'ng'
obj_pronouns = read_text('obj_pronouns')
# Maps each canonical profanity ('bobo', 'bwiset', 'gago', 'kupal', 'pakshet',
# 'pakyu', 'pucha', 'punyeta', 'puta', 'pota', 'putangina', 'tanga', 'tangina',
# 'tarantado', 'ulol') to a list of its known variations
profanities = read_text('profanities', 'json')


def fuzzyLookup(tweet):
    # All known profane spellings: every variation plus the canonical forms
    lookup_profanity = np.concatenate([np.hstack(list(profanities.values())), list(profanities.keys())])
    matches = {}

    # Fuzzy-match each word in the tweet against the lookup vocabulary
    for word in tweet.split():
        scores = []
        matched_words = []
        # Only consider words of at least 4 characters
        if len(word) >= 4:
            for lookup_word in lookup_words:
                score = fuzz.ratio(word, lookup_word)
                if score >= 65:
                    scores.append(score)
                    matched_words.append(lookup_word)
            if len(scores) > 0:
                max_score_index = np.argmax(scores)
                # Keep the best match only if it is a known profanity
                if matched_words[max_score_index] in lookup_profanity:
                    matches[word] = matched_words[max_score_index]

    # Recover object pronouns fused to a profanity (e.g. 'tanginamo' -> 'tangina mo'):
    # split the word on the profanity's last two characters and check whether the
    # trailing piece is an object pronoun
    for word, matched_profanity in matches.items():
        word_split = word.split(matched_profanity[-2:])
        for pronoun in obj_pronouns:
            if len(word_split) > 1:
                if pronoun == word_split[-1]:
                    matches[word] = matched_profanity + ' ' + pronoun
                    break

    # Replace each matched word with its fuzzy-lookup result
    for word, matched_profanity in matches.items():
        tweet = tweet.replace(word, matched_profanity)

    # Normalize known variations to their canonical profanity
    tweet_split = tweet.split()
    for profanity, prof_variations in profanities.items():
        for i, word in enumerate(tweet_split):
            if word in prof_variations:
                tweet_split[i] = profanity
    tweet = ' '.join(tweet_split)

    return tweet, json.dumps(matches)


def preprocess(text):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase
    text = text.lower()

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Collapse elongated words, e.g. 'grabeee' -> 'grabe'
    # (only runs of 3+ repeated letters; doubled letters are kept)
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Split the sentence into a list of words
    row_split = text.split()

    for index, word in enumerate(row_split):
        # Remove words with symbols (e.g. @username, #hashtag)
        if any(x in word for x in symbols):
            row_split[index] = ''
        # Remove links
        if 'http' in word:
            row_split[index] = ''
        # Unify laugh text format to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'
        # Remove words with digits (e.g. '4ever')
        if any(x.isdigit() for x in word):
            row_split[index] = ''

    # Combine the list of words back into a sentence
    combined_text = ' '.join(filter(None, row_split))

    # If only a single word remains, return it as-is
    # (skip character filtering and contraction expansion)
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only letters and spaces
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions
    for contraction, expansion in contractions.items():
        combined_text = re.sub(rf"\b{contraction}\b", expansion, combined_text)

    return combined_text


def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def predict(text):
    text = preprocess(text)
    text, matches = fuzzyLookup(text)
    output = query(text)

    if 'error' in output:
        return output['error'], 'Error occurred. Try again later.', {"error": "error"}

    # Convert [{'label': ..., 'score': ...}, ...] into a {label: score} dict;
    # the API returns labels sorted by score, so the first key is the prediction
    output = dict(tuple(i.values()) for i in output[0])
    predicted_label = list(output.keys())[0]

    if predicted_label == 'Abusive':
        # Mask each canonical profanity with asterisks, preserving spaces
        output_text = text
        for profanity in profanities:
            compiled = re.compile(re.escape(profanity), re.IGNORECASE)
            mask = ''.join('*' if char != ' ' else ' ' for char in profanity)
            output_text = compiled.sub(mask, output_text)
        return output, output_text, matches

    return output, text, matches


# TODO gag0 not appearing

hf_writer = gr.HuggingFaceDatasetSaver('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', 'tagalog-profanity-feedbacks')

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label='PREDICTION'),
             gr.components.Text(label='OUTPUT'),
             gr.components.JSON()],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging='manual',
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot']
)

demo.launch()
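
# Illustrative end-to-end behavior (a sketch, not executed here; assumes
# 'tangina' appears in lookup_words.txt and as a key in profanities.json,
# and that 'mo' is listed in obj_pronouns.txt):
#
#   preprocess('Tanginamooo ka ba?? 🤡 #PLDC')
#       -> 'tanginamo ka ba'   (lowercased, emoji and #hashtag removed,
#                               'ooo' collapsed, punctuation stripped)
#   fuzzyLookup('tanginamo ka ba')
#       -> ('tangina mo ka ba', '{"tanginamo": "tangina mo"}')
#       fuzz.ratio('tanginamo', 'tangina') ~ 88 clears the 65 threshold, and
#       the trailing 'mo' left after splitting on 'na' is recognized as an
#       object pronoun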