import gradio as gr
import requests
import emoji
import re
import json
from thefuzz import process, fuzz
import numpy as np
import nltk

nltk.download('words')
from nltk.corpus import words
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# NOTE: a hard-coded token is visible to anyone who can read this file;
# prefer loading it from an environment variable or a Space secret.
headers = {"Authorization": "Bearer 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"}
def query(text):
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
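# A minimal sketch of the response shapes this app assumes from the
# Inference API (inferred from how predict() indexes the result below;
# the exact payload depends on the hosted model):
#   success:                [[{"label": "Abusive", "score": 0.98}, ...]]
#   failure/model loading:  {"error": "...", "estimated_time": ...}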
def read_text(filename, filetype='txt'):
    # Load a lookup resource: .txt files hold one entry per line,
    # .json files hold a serialized dict.
    data = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            data = [line.rstrip() for line in file]
            data = list(set(data))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            data = json.load(json_file)
    return data
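# Illustrative file layouts assumed by read_text (hypothetical contents):
#   addon_words.txt   -> "mo\nka\npinaka\n..."
#   profanities.json  -> {"tangina": ["tngina", "tangna", ...], ...}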
contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')

lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
eng_words = list(set(words.words()) - set(lookup_profanity))

# Strips leading/trailing non-word characters, keeping # and @
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
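# For example (illustrative): punctuations.sub('', '!!bobo?!') -> 'bobo',
# while punctuations.sub('', '@user...') -> '@user' because @ survives.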
def fuzzy_lookup(tweet):
    matched_profanity = dict()

    for word in tweet.split():
        if word in eng_words:
            continue
        scores = []
        matched_words = []
        matched_word = None

        # Strip leading/trailing punctuation except # and @
        word = punctuations.sub('', word).lower()

        # Save the base word
        base_word = word

        # Shorten elongated words (e.g. "bobooo" -> "bobo")
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @ prefixes
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on prefixes/suffixes (e.g. mo, ka, pinaka)
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        if len(word) < 4:
            continue
        # Get fuzzy ratio against every lookup word
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            # Threshold
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All scores tie: prefer a match that is itself a profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Take the matched word with the highest score
            max_score_index = np.argmax(scores)
            matched_word = matched_words[max_score_index]

        if matched_word not in lookup_profanity:
            continue

        # Map the match back to its canonical (base) profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Re-attach a trailing pronoun (e.g. mo, ka)
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
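# Illustrative call (hypothetical misspelling; actual output depends on the
# loaded word lists): fuzzy_lookup("tngina mo talaga") might return
# {"tngina": "tangina"}.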
def preprocess(tweet, profanities):
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace detected misspellings with their canonical profanity
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Collapse characters repeated three or more times
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    row_split = tweet.split()

    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh text variants to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine the list of words back into a sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet
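# Illustrative run (hypothetical mapping from fuzzy_lookup):
#   preprocess("Tangina mo!! HAHAHAHA http://t.co/x", {"tangina": "tangina"})
#   -> "tangina mo!! haha"   (assuming no contraction rule fires)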
def predict(tweet):
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:
        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The API returns a dict on failure (e.g. while the model loads)
        if isinstance(prediction, dict):
            print(prediction)
            error_message = prediction.get('error', str(prediction))
            return error_message, []

        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", []
demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠🤬 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier"
)

demo.launch(debug=True)