"""Text-cleaning pipeline: regex cleanup, dictionary normalisation, stemming.

Importing this module has side effects: it downloads the NLTK ``stopwords``
corpus, reads ``new_kamusalay.csv`` (resolved against the current working
directory) and builds a Sastrawi stemmer.
"""
import functools
import pathlib
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('stopwords')

# Two-column, header-less CSV: column 0 is the token to look for, column 1 is
# the token to substitute for it.
# NOTE(review): presumably a slang -> standard-word dictionary; confirm
# against the data file.
alay_dict = pd.read_csv(pathlib.Path('new_kamusalay.csv').resolve(), encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Patterns are compiled once; raw strings avoid invalid-escape warnings.
# Word boundaries keep 'rt' / 'user' from being cut out of longer words
# (e.g. 'party', 'username').
_RETWEET_RE = re.compile(r'\brt\b')
_USERNAME_RE = re.compile(r'\buser\b')
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))')
_MULTISPACE_RE = re.compile(r' +')
_NON_ALNUM_RE = re.compile(r'[^0-9a-zA-Z]+')


def lowercase(text: str) -> str:
    """Return *text* converted to lower case."""
    return text.lower()


def remove_unnecessary_char(text: str) -> str:
    """Strip newlines, URLs and the standalone tokens ``rt`` / ``user``.

    Matching is case-sensitive, so callers should lowercase first. Each
    removed span is replaced by a space and runs of spaces are collapsed;
    leading/trailing spaces are not stripped.
    """
    text = text.replace('\n', ' ')
    # URLs go first so a URL containing 'rt' or 'user' is removed whole
    # instead of being split by the token substitutions below.
    text = _URL_RE.sub(' ', text)
    text = _RETWEET_RE.sub(' ', text)
    text = _USERNAME_RE.sub(' ', text)
    return _MULTISPACE_RE.sub(' ', text)


def remove_nonaplhanumeric(text: str) -> str:
    """Replace every run of non-ASCII-alphanumeric characters with one space.

    The misspelled name is kept for backward compatibility; see the
    ``remove_nonalphanumeric`` alias.
    """
    return _NON_ALNUM_RE.sub(' ', text)


# Correctly spelled alias for new callers.
remove_nonalphanumeric = remove_nonaplhanumeric


def normalize_alay(text: str) -> str:
    """Replace each space-separated word found in ``alay_dict_map``.

    Words without an entry are kept unchanged.
    """
    return ' '.join(alay_dict_map.get(word, word) for word in text.split(' '))


@functools.lru_cache(maxsize=1)
def _indonesian_stopwords() -> frozenset:
    """Load NLTK's ``'indonesian'`` stopword list once, as a set."""
    return frozenset(stopwords.words('indonesian'))


def remove_stopword(text: str) -> str:
    """Drop stopwords from *text* and normalise spacing.

    The result has single spaces between the remaining words and no
    leading or trailing space.
    """
    stop_set = _indonesian_stopwords()
    # Skipping empty tokens has the same effect as collapsing repeated
    # spaces and stripping the ends afterwards.
    kept = [word for word in text.split(' ') if word and word not in stop_set]
    return ' '.join(kept)


def stemming(text: str) -> str:
    """Return *text* passed through the module-level Sastrawi stemmer."""
    return stemmer.stem(text)


def preprocessing(text: str) -> str:
    """Run the full cleaning pipeline on *text* and return the result.

    Steps, in order: lowercase, strip URLs / ``rt`` / ``user`` tokens,
    drop non-alphanumeric characters, apply the replacement dictionary,
    stem, remove stopwords.
    """
    text = lowercase(text)
    # URL / token cleanup must precede the non-alphanumeric pass: once
    # '.', ':' and '/' are gone the URL pattern can no longer match.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    text = normalize_alay(text)
    text = stemming(text)
    text = remove_stopword(text)
    return text