"""Gradio demo: Tagalog profanity classification with profanity masking.

Preprocessed input text is sent to a HuggingFace inference endpoint; when
the top predicted label is 'Abusive', known profanities in the echoed text
are replaced with asterisks.
"""
import os
import re

import emoji
import gradio as gr
import requests

API_URL = "https://api-inference.huggingface.co/models/Dabid/test2"

# SECURITY: an API token was hard-coded here; it should be revoked and
# supplied via the HF_API_TOKEN environment variable. The literal remains
# only as a backward-compatible fallback for existing deployments.
headers = {
    "Authorization": "Bearer "
    + os.environ.get("HF_API_TOKEN", "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")
}

# Profanities masked in the echoed output when the text is labeled Abusive.
profanities = ['bobo', 'bobong', 'bwiset', 'bwisit', 'buwisit', 'buwiset',
               'bwesit', 'gago', 'gagong', 'kupal', 'pakshet', 'pakyu',
               'pucha', 'puchang', 'punyeta', 'punyetang', 'puta', 'putang',
               'putangina', 'putanginang', 'tanga', 'tangang', 'tangina',
               'tanginang', 'tarantado', 'tarantadong', 'ulol']

# Tagalog contraction -> expanded form. Applied as whole-word substitutions
# in insertion order (order matters: e.g. 'putang ina' -> 'putangina' must
# run before later profanity lookups see the text).
contractions = {
    'di': 'hindi', 'to': 'ito', 'no': 'ano', 'kundi': 'kung hindi',
    'nya': 'niya', 'nyo': 'ninyo', 'niyo': 'ninyo', 'pano': 'paano',
    'sainyo': 'sa inyo', 'sayo': 'sa iyo', 'pag': 'kapag', 'kesa': 'kaysa',
    'dun': 'doon', 'ganto': 'ganito', 'nandun': 'nandoon', 'saka': 'tsaka',
    'ung': 'yung', 'wag': 'huwag', 'sya': 'siya', 'bat': 'bakit',
    'yon': 'iyon', 'yun': 'iyon', 'dyan': 'diyan', 'jan': 'diyan',
    'andito': 'nandito', 'tanginamo': 'tangina mo',
    'putanginamo': 'putangina mo', 'san': 'saan', 'ganun': 'ganoon',
    'gagong': 'gago na', 'bobong': 'bobo na', 'tangang': 'tanga na',
    'kelan': 'kailan', 'raw': 'daw', 'tanginang': 'tangina na',
    'tarantadong': 'tarantado na', 'putang ina': 'putangina',
    'putang inang': 'putangina', 'putanginang': 'putangina',
    'itong': 'ito ang', 'lng': 'lang', 'bwisit': 'bwiset',
    'bwesit': 'bwiset', 'buwisit': 'bwiset', 'buwesit': 'bwiset'
}


def preprocess(row):
    """Normalize raw input text before sending it to the model.

    Lowercases, strips emojis, collapses elongated characters, removes
    mentions/hashtags/links/digit-words, unifies laugh variants, filters to
    ASCII letters and spaces, and expands known Tagalog contractions.
    """
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah',
                   'ahah', 'hha']
    symbols = ['@', '#']

    # Lowercase everything first so later matching is case-insensitive.
    row = row.lower()

    # Remove emojis entirely.
    row = emoji.replace_emoji(row, replace='')

    # Collapse characters repeated 3+ times ('grabeee' -> 'grabe');
    # doubled letters are deliberately left intact.
    row = re.sub(r'(.)\1{2,}', r'\1', row)

    row_split = row.split()
    for index, word in enumerate(row_split):
        # Drop mentions and hashtags (e.g. @username, #hashtag).
        if any(x in word for x in symbols):
            row_split[index] = ''
        # Drop links.
        if 'http' in word:
            row_split[index] = ''
        # Normalize laugh variants to a single 'haha' token.
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'
        # Drop words containing digits (e.g. '4ever').
        if any(ch.isdigit() for ch in word):
            row_split[index] = ''

    # Rejoin, skipping the words blanked out above.
    combined_text = ' '.join(filter(None, row_split))

    # NOTE(review): the original comment claimed a single remaining word
    # should "return null", but the code returns the word itself — skipping
    # the character filter and contraction expansion below. Behavior is kept
    # as-is; confirm the intent.
    if len(combined_text.split()) == 1:
        return combined_text

    # Keep only ASCII letters and spaces.
    combined_text = re.sub(r"[^A-Za-z ]+", '', combined_text)

    # Expand contractions with whole-word matches, in dict insertion order.
    for short, expanded in contractions.items():
        combined_text = re.sub(rf"\b{short}\b", expanded, combined_text)

    return combined_text


def query(payload):
    """POST the payload to the inference endpoint and return the JSON reply."""
    # Timeout added so a stalled endpoint cannot hang the UI indefinitely.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    return response.json()


def predict(text):
    """Classify `text`; if predicted Abusive, mask profanities in the echo.

    Returns a (label->score dict, output text) pair for the Gradio outputs,
    or (error message, hint) when the inference API reports an error.
    """
    # Preprocess once (the original called preprocess() twice).
    cleaned = preprocess(text)
    print(cleaned)
    output = query(cleaned)

    if 'error' in output:
        # e.g. the model is still loading on the inference API.
        return output['error'], 'Error occurred. Try again later.'

    # Response shape: [[{'label': ..., 'score': ...}, ...]], ordered by score.
    output = dict(tuple(item.values()) for item in output[0])
    predicted_label = next(iter(output))

    if predicted_label != 'Abusive':
        return output, text

    # Mask each profanity character-for-character ('*' per letter,
    # spaces preserved), case-insensitively.
    output_text = text
    for profanity in profanities:
        pattern = re.compile(re.escape(profanity), re.IGNORECASE)
        mask = ''.join('*' if ch != ' ' else ' ' for ch in profanity)
        output_text = pattern.sub(mask, output_text)
    return output, output_text


# Flagged examples are persisted to a HuggingFace dataset.
# SECURITY: token was hard-coded; revoke it and set HF_WRITER_TOKEN instead.
hf_writer = gr.HuggingFaceDatasetSaver(
    os.environ.get('HF_WRITER_TOKEN', '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'),
    'tagalog-profanity-feedbacks',
)

demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here',
                                  label='INPUT')],
    outputs=[gr.components.Label(num_top_classes=2, label="PREDICTION"),
             gr.components.Text(label='OUTPUT')],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="manual",
    flagging_callback=hf_writer,
    flagging_options=['Good bot', 'Bad bot'],
)

# Guarded launch: still starts the server when run as a script, but no
# longer fires as a side effect of a mere import.
if __name__ == "__main__":
    demo.launch()