# NOTE: lines below are residue from the Hugging Face file-viewer page
# (uploader: kompiangg, commit 6460fef, size 1.74 kB); kept as a comment
# so this module remains valid, importable Python.
import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords
import pathlib
nltk.download('stopwords')  # fetch NLTK stopword corpora (includes Indonesian) at import time

# Colloquial-to-standard Indonesian ("alay") dictionary; CSV has no header row:
# column 0 = slang form, column 1 = canonical replacement.
alay_dict = pd.read_csv(pathlib.Path('new_kamusalay.csv').resolve(), encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})

# Sastrawi stemmer for Indonesian — module-level singleton reused by stemming().
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def lowercase(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
def remove_unnecessary_char(text):
    """Strip tweet noise from *text*: newlines, standalone 'rt'/'user'
    tokens, URLs, and runs of consecutive spaces.

    'rt' (retweet marker) and 'user' (anonymized username token) are
    matched as whole words; the original substring match mangled any
    word containing those letters (e.g. 'participate' -> 'pa icipate').
    """
    text = re.sub(r'\n', ' ', text)        # every '\n' -> space
    text = re.sub(r'\brt\b', ' ', text)    # standalone retweet symbol
    text = re.sub(r'\buser\b', ' ', text)  # standalone username placeholder
    # URLs: www.…, http://…, https://… — the original extra 'http?://'
    # alternative actually matched 'htt://'; 'https?://' already covers both.
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', ' ', text)
    text = re.sub(' +', ' ', text)         # collapse repeated spaces
    return text
def remove_nonaplhanumeric(text):
    """Replace each run of non-alphanumeric characters with a single space."""
    return re.sub('[^0-9a-zA-Z]+', ' ', text)
# Lookup table: colloquial ("alay") word -> normalized replacement, built
# from the two columns of the new_kamusalay.csv dataframe loaded above.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    """Replace each colloquial (alay) token with its dictionary form.

    Tokens absent from ``alay_dict_map`` pass through unchanged.
    """
    tokens = text.split(' ')
    normalized = [alay_dict_map.get(token, token) for token in tokens]
    return ' '.join(normalized)
def remove_stopword(text):
    """Drop Indonesian stopwords from *text* and tidy the whitespace.

    Returns the surviving words joined by single spaces, with
    leading/trailing whitespace stripped.
    """
    # Build a set once per call: O(1) membership tests instead of the
    # original O(n) scan of the raw list for every word.
    stopword_set = set(stopwords.words('indonesian'))
    text = ' '.join(['' if word in stopword_set else word for word in text.split(' ')])
    text = re.sub(' +', ' ', text)  # collapse gaps left by removed words
    return text.strip()
def stemming(text):
    """Reduce the words of *text* to their root forms with the module-level Sastrawi stemmer."""
    return stemmer.stem(text)
def preprocessing(text):
    """Full cleaning pipeline for one raw tweet/text string.

    Steps: lowercase -> strip noise (rt/user/URLs) -> drop punctuation ->
    normalize slang -> stem -> remove stopwords.
    """
    text = lowercase(text)               # 1: case-fold first so patterns match
    # 2: remove URLs/rt/user BEFORE stripping punctuation; the original
    # order deleted ':' and '/' first, so the URL regex could never match
    # and URL fragments leaked into the output.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)  # 3: punctuation/symbols -> spaces
    text = normalize_alay(text)          # 4: slang -> dictionary form
    text = stemming(text)                # 5: Sastrawi stemming
    text = remove_stopword(text)         # 6: drop Indonesian stopwords
    return text