import json
import re

import emoji
import gradio as gr
import numpy as np
import requests
from english_words import get_english_words_set
from thefuzz import fuzz

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# NOTE: hard-coding a token exposes it to anyone who can read this file;
# prefer an environment variable or a Space secret.
headers = {"Authorization": "Bearer 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"}


def query(text):
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def read_text(filename, filetype='txt'):
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words


contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')

# Flatten the variation lists and the base profanities into one lookup list
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())),
                                   list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))

# Strips leading/trailing punctuation but keeps # and @ so hashtags and mentions survive
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')


def fuzzy_lookup(tweet):
    matched_profanity = dict()

    for word in tweet.split():
        # Skip plain English words
        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Remove leading/trailing punctuation except # and @
        word = punctuations.sub('', word).lower()

        # Save base word
        base_word = word

        # Shorten elongated word
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on words (mo, ka, pinaka) from either end
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        # Too short to match reliably
        if len(word) < 4:
            continue

        # Get fuzzy ratio against every lookup word
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)

            # Threshold
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates are tied; prefer one that is itself a profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Get matched word with max score
            max_score_index = np.argmax(scores)
            matched_word = matched_words[max_score_index]

        if matched_word not in lookup_profanity:
            continue

        # Map the match back to its base profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Separate the pronoun from the base profanity
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
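# A quick sketch of the matching rule above, run from a REPL (the inputs are
# illustrative misspellings, not entries from the bundled word lists):
#
#   >>> from thefuzz import fuzz
#   >>> fuzz.ratio("tangna", "tangina")   # 92 -> clears the 70 threshold
#   >>> fuzz.ratio("talaga", "tangina")   # ~62 -> discarded
#
# fuzz.ratio is a normalized edit-distance similarity, which is why
# misspelled or vowel-dropped profanity variants still get caught.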
def preprocess(tweet, profanities):
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace detected profanities with their base form
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()

    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh texts format to 'haha'
        laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    # Single-word tweets have nothing left to expand
    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet


def predict(tweet):
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:
        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The Inference API returns a dict (e.g. while the model is still
        # loading) instead of the usual nested list of label scores
        if isinstance(prediction, dict):
            print(prediction)
            error_message = prediction['error']
            return error_message, {}

        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", {}


demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"),
             gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier",
)

demo.launch(debug=True)
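# Why predict() indexes prediction[0][0]["label"]: for text-classification
# models the Inference API responds with a nested list of label/score dicts,
# roughly shaped like the sketch below (the label names are illustrative,
# not confirmed from this model's config):
#
#   [[{"label": "Abusive", "score": 0.98},
#     {"label": "Non-Abusive", "score": 0.02}]]
#
# so [0][0] is the top-scoring label, while a still-loading model answers
# with {"error": ..., "estimated_time": ...}, handled by the isinstance() branch.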