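"""Gradio demo: Tagalog profanity detection with hashtag support.

Fuzzy-matches tweet tokens (including #hashtags and @mentions) against a
profanity dictionary, normalizes the text, and classifies it with the
Dabid/abusive-tagalog-profanity-detection model via the Hugging Face
Inference API.
"""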
import gradio as gr
import requests
import emoji
import re
import json
import numpy as np
from thefuzz import fuzz
from english_words import get_english_words_set
# Hugging Face Inference API endpoint and access token for the fine-tuned model
API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
headers = {"Authorization": "Bearer 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"}
def query(text):
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
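# For text-classification models the Inference API typically returns a nested
# list like [[{"label": "...", "score": 0.99}, ...]]; while the model is still
# loading it instead returns a dict such as {"error": "...", "estimated_time": ...}.
# The label strings are model-specific; predict() below reads only [0][0]["label"].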
def read_text(filename, filetype='txt'):
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
        # De-duplicate (order is not preserved)
        words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words
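# Word lists and dictionaries shipped alongside this script. From how they are
# used below, profanities.json is assumed to map each base profanity to a list
# of variant spellings, while the .txt files hold one word per line.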
contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')
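# Flatten every variant spelling plus the base forms into one profanity list,
# union it with the hand-curated similar words to form the fuzzy-match
# vocabulary, and exclude known profanities from the English word set so
# ordinary English tokens can be skipped during lookup.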
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())), list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))
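# Strips leading/trailing non-word characters while keeping '#' and '@',
# e.g. 'gago!!' -> 'gago' and '#PLDC' -> '#PLDC'.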
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')
def fuzzy_lookup(tweet):
    matched_profanity = dict()

    for word in tweet.split():
        # Skip common English words to cut down on false fuzzy matches
        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Remove leading/trailing punctuation except # and @
        word = punctuations.sub('', word).lower()

        # Save base word
        base_word = word

        # Shorten elongated word (e.g. 'gagooo' -> 'gago')
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on affixes (e.g. mo, ka, pinaka) from either end
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        if len(word) < 4:
            continue

        # Collect every lookup word whose fuzzy ratio clears the threshold
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates tied: prefer the first one that is a profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Otherwise take the candidate with the highest score
            matched_word = matched_words[np.argmax(scores)]

        if matched_word not in lookup_profanity:
            continue

        # Map the match back to its canonical (base) profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Re-attach a stripped pronoun so it survives preprocessing
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
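# Illustrative only, since the result depends on the bundled word lists:
# fuzzy_lookup("Tangina naman gagooo!!") might return
# {"tangina": "tangina", "gagooo": "gago"}, i.e. cleaned original token ->
# canonical profanity, with any stripped add-on pronoun re-attached.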
def preprocess(tweet, profanities):
    laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']

    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Swap each detected token for its canonical profanity
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Collapse elongated words (e.g. 'namaaan' -> 'naman')
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()
    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''
        # Unify laugh text format to 'haha'
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine the list of words back into a sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet
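# Sketch on a made-up input, where {"bobooo": "bobo"} stands in for a
# hypothetical fuzzy_lookup result and contraction expansion depends on
# contractions.json:
#   preprocess("Sobrang BOBOOO mo HAHAHA http://t.co/x 😂", {"bobooo": "bobo"})
#   -> "sobrang bobo mo haha", which is the form actually sent to the model.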
def predict(tweet):
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:
        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The API returns a dict instead of a list on failure,
        # e.g. while the model is still loading
        if isinstance(prediction, dict):
            print(prediction)
            return prediction['error'], {}

        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", {}
demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"), gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier"
)

demo.launch(debug=True)
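# debug=True keeps the process attached and surfaces errors in the console;
# passing share=True here as well would expose a temporary public Gradio link
# if the app needs to be reached from outside the Space.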