"""Gradio demo that flags abusive Tagalog profanity in a piece of text.

The input is normalised (lower-casing, emoji removal, fuzzy profanity
matching, contraction expansion) and then sent to a hosted Hugging Face
model for classification.
"""

import json
import os
import re
import time
from string import punctuation

import emoji
import gradio as gr
import numpy as np
import requests
from thefuzz import process, fuzz

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"

# NOTE(review): the literal token below is committed to source control; it is
# kept only as a backward-compatible fallback. Prefer setting HF_API_TOKEN in
# the environment, then rotate and remove the literal.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "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")
headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"}

# Substrings that mark a token as laughter; such tokens are rewritten to 'haha'.
_LAUGH_TEXTS = ('hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha')


def query(text):
    """POST *text* to the inference endpoint and return the decoded JSON body."""
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def read_text(filename, filetype='txt'):
    """Load a word resource stored next to the script.

    Args:
        filename: File name without its extension.
        filetype: ``'txt'`` reads one entry per line and drops duplicates;
            ``'json'`` returns the decoded JSON document as-is.

    Returns:
        A list of unique lines for ``'txt'``, the parsed JSON object for
        ``'json'``, or an empty list for any other *filetype*.
    """
    words = []

    if filetype == 'txt':
        with open(filename + '.txt', encoding='utf-8') as file:
            # dict.fromkeys removes duplicates while keeping file order, so
            # tie-breaking between equal fuzzy scores is stable across runs.
            words = list(dict.fromkeys(line.rstrip() for line in file))
    elif filetype == 'json':
        with open(filename + '.json', encoding='utf-8') as json_file:
            words = json.load(json_file)

    return words


contractions = read_text('contractions', 'json')
lookup_words = read_text('lookup_words')
obj_pronouns = read_text('obj_pronouns')
profanities = read_text('profanities', 'json')
loading_countdown = 0


def fuzzy_lookup(tweet):
    """Replace misspelled or obfuscated profanities in *tweet* with their base form.

    Each whitespace-separated token is stripped of surrounding punctuation and
    compared (``fuzz.ratio``) against every entry of ``lookup_words``. A token
    is recorded when its best match scores at least 70 and that match is a
    known profanity (a key or a listed variant in ``profanities``).

    Args:
        tweet: The text to scan.

    Returns:
        A ``(tweet, matched_profanity)`` tuple: the rewritten text and a dict
        mapping each matched token to the profanity that replaced it.
    """
    matched_profanity = dict()

    # Every profanity key plus every listed variant, as a set for O(1) lookup.
    # Assumes each value in `profanities` is a list of strings — TODO confirm.
    lookup_profanity = set(profanities)
    for variants in profanities.values():
        lookup_profanity.update(variants)

    for word in tweet.split():
        word = word.strip(punctuation)
        processed_word = re.sub("[^a-zA-Z0-9@]", "", word)

        # Short tokens are skipped: only tokens of 4+ kept characters are matched.
        if len(processed_word) < 4:
            continue

        # Keep the first lookup word that reaches the highest score >= 70.
        best_score = 0
        best_match = None
        for lookup_word in lookup_words:
            score = fuzz.ratio(processed_word, lookup_word)
            if score >= 70 and score > best_score:
                best_score = score
                best_match = lookup_word

        if best_match is not None and best_match in lookup_profanity:
            matched_profanity[word] = best_match

    # When the token, split on the profanity's last two characters, ends in an
    # object pronoun, keep that pronoun as a separate word after the profanity.
    # Only values of existing keys are reassigned, which is safe mid-iteration.
    for word, profanity in matched_profanity.items():
        word_split = word.split(profanity[-2:])
        if len(word_split) > 1 and word_split[-1] in obj_pronouns:
            matched_profanity[word] = profanity + ' ' + word_split[-1]

    # Replace each profanities by fuzzy lookup result
    for word, profanity in matched_profanity.items():
        tweet = tweet.replace(word, profanity)

    # Collapse every listed variant to its base profanity.
    for profanity, prof_varations in profanities.items():
        for prof_variant in prof_varations:
            tweet = tweet.replace(prof_variant, profanity)

    return tweet, matched_profanity


def preprocess(tweet):
    """Normalise *tweet* before classification.

    Steps, in order: lower-case, strip emoji, collapse runs of three or more
    identical characters to one, blank out tokens containing ``'http'``,
    rewrite laughter tokens to ``'haha'``, run :func:`fuzzy_lookup`, and expand
    contractions (skipped when the text is a single word).

    Args:
        tweet: Raw user input.

    Returns:
        A ``(preprocessed_tweet, matches)`` tuple, where *matches* is the dict
        produced by :func:`fuzzy_lookup`.
    """
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()

    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh texts format to 'haha'
        if any(x in word for x in _LAUGH_TEXTS):
            row_split[index] = 'haha'

    # Combine list of words back to sentence (blanked tokens are dropped)
    preprocessed_tweet = ' '.join(filter(None, row_split))

    # Fuzzy Lookup
    preprocessed_tweet, matches = fuzzy_lookup(preprocessed_tweet)

    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet, matches

    # Expand Contractions
    # NOTE(review): keys are interpolated into the pattern unescaped; confirm
    # whether contraction keys are meant to be literal text or regex.
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet, matches


def predict(tweet):
    """Classify *tweet* and return a value for the Gradio label output.

    Args:
        tweet: Raw user input.

    Returns:
        A status string when the API answers with a dict (the model is still
        loading, or the API reported an error), ``"No Profanity"`` when the
        fuzzy lookup matched nothing, otherwise a ``{label: score}`` dict.
    """
    preprocessed_tweet, matched_profanity = preprocess(tweet)

    prediction = query(preprocessed_tweet)

    # A dict response (rather than a list) signals a non-result payload.
    if isinstance(prediction, dict):
        loading_time = prediction.get('estimated_time')
        if loading_time is None:
            # Error payloads without 'estimated_time' used to raise KeyError.
            return f"Model Error: {prediction.get('error', 'Unknown error')}"
        return f"Loading Model (Estimated Time: {loading_time} Seconds)"

    if not matched_profanity:
        return "No Profanity"

    # Assumes the text-classification shape [[{'label': ..., 'score': ...}, ...]]
    # — TODO confirm against the deployed model.
    prediction = {item['label']: item['score'] for item in prediction[0]}

    print("\nTWEET:", tweet)
    print("DETECTED PROFANITY:", matched_profanity)
    print("LABELS:", prediction, "\n")

    return prediction


demo = gr.Interface(
    fn=predict,

    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],

    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION")],

    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
)

demo.launch(debug=True)