kompiangg's picture
preprocessssssss
ae8ae26
raw
history blame
1.7 kB
import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords
# One-time setup: fetch the NLTK stopword corpus (no-op if already cached locally).
nltk.download('stopwords')
# "Alay" (Indonesian slang) dictionary: CSV with two unnamed columns —
# column 0 = slang spelling, column 1 = normalized replacement.
alay_dict = pd.read_csv('./new_kamusalay.csv', encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})
# Shared Sastrawi stemmer instance used by stemming() below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def lowercase(text):
    """Return *text* with all characters lower-cased."""
    return str.lower(text)
def remove_unnecessary_char(text):
    """Strip tweet noise from *text*: newlines, retweet markers, anonymized
    usernames, and URLs, then collapse repeated spaces.

    Fixes vs. the original: 'rt' and 'user' are matched as whole words only
    (the old substring patterns mangled words like 'party' -> 'pa y' and
    'username' -> ' name'), and the malformed 'http?://' alternative (which
    also matched 'htt://' and is subsumed by 'https?://') is dropped.
    """
    text = re.sub(r'\n', ' ', text)  # newlines -> spaces
    text = re.sub(r'\brt\b', ' ', text)  # retweet marker, whole word only
    text = re.sub(r'\buser\b', ' ', text)  # anonymized username token, whole word only
    # Any www. / http:// / https:// URL up to the next whitespace.
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', ' ', text)
    text = re.sub(' +', ' ', text)  # collapse runs of spaces
    return text
def remove_nonaplhanumeric(text):
    """Replace every run of non-alphanumeric characters with a single space.

    (Name keeps the original's spelling since callers depend on it.)
    """
    cleaned = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    return cleaned
# slang-word -> normalized-word lookup table built from the alay dictionary.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    """Replace each space-separated token found in the alay (slang) dictionary
    with its normalized form; unknown tokens pass through unchanged."""
    tokens = text.split(' ')
    normalized = [alay_dict_map.get(token, token) for token in tokens]
    return ' '.join(normalized)
def remove_stopword(text):
    """Remove Indonesian stopwords from *text*, collapse the resulting gaps,
    and strip leading/trailing whitespace.

    Fix vs. the original: the stopword list is converted to a set so each
    membership test is O(1) instead of a linear scan over the whole list.
    NOTE(review): the corpus is still loaded on every call; hoisting it to
    module level would avoid repeated disk reads — left in place to keep this
    change self-contained.
    """
    stopword_set = set(stopwords.words('indonesian'))
    text = ' '.join(['' if word in stopword_set else word for word in text.split(' ')])
    text = re.sub(' +', ' ', text)  # collapse the gaps left by removed words
    text = text.strip()
    return text
def stemming(text):
    """Reduce the words of *text* to their root forms using the shared
    module-level Sastrawi stemmer."""
    stemmed = stemmer.stem(text)
    return stemmed
def preprocess(text):
    """Full cleaning pipeline for an Indonesian tweet string.

    Fix vs. the original: remove_unnecessary_char now runs BEFORE
    remove_nonaplhanumeric. The old order stripped ':' and '/' first, so the
    URL pattern inside remove_unnecessary_char could never match an actual
    URL. Also renumbers the duplicated '# 2' step comments.
    """
    text = lowercase(text)                # 1. case-fold
    text = remove_unnecessary_char(text)  # 2. newlines / rt / user / URLs
    text = remove_nonaplhanumeric(text)   # 3. punctuation -> spaces
    text = normalize_alay(text)           # 4. slang normalization
    text = stemming(text)                 # 5. root words
    text = remove_stopword(text)          # 6. drop stopwords
    return text