# NOTE: lines below are residue from the Hugging Face file-viewer page
# (uploader: kompiangg, commit 6460fef, size 1.74 kB); kept as a comment
# so this module remains valid, importable Python.
import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords
import pathlib
nltk.download('stopwords')  # fetch NLTK stopword corpora (includes Indonesian) at import time

# Colloquial-to-standard Indonesian ("alay") dictionary; CSV has no header row:
# column 0 = slang form, column 1 = canonical replacement.
alay_dict = pd.read_csv(pathlib.Path('new_kamusalay.csv').resolve(), encoding='latin-1', header=None)
alay_dict = alay_dict.rename(columns={0: 'original', 1: 'replacement'})

# Sastrawi stemmer for Indonesian — module-level singleton reused by stemming().
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def lowercase(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
def remove_unnecessary_char(text):
    """Strip tweet noise from *text*: newlines, standalone 'rt'/'user'
    tokens, URLs, and runs of consecutive spaces.

    'rt' (retweet marker) and 'user' (anonymized username token) are
    matched as whole words; the original substring match mangled any
    word containing those letters (e.g. 'participate' -> 'pa icipate').
    """
    text = re.sub(r'\n', ' ', text)        # every '\n' -> space
    text = re.sub(r'\brt\b', ' ', text)    # standalone retweet symbol
    text = re.sub(r'\buser\b', ' ', text)  # standalone username placeholder
    # URLs: www.…, http://…, https://… — the original extra 'http?://'
    # alternative actually matched 'htt://'; 'https?://' already covers both.
    text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', ' ', text)
    text = re.sub(' +', ' ', text)         # collapse repeated spaces
    return text
def remove_nonaplhanumeric(text):
    """Replace each run of non-alphanumeric characters with a single space."""
    return re.sub('[^0-9a-zA-Z]+', ' ', text)
# Lookup table: colloquial ("alay") word -> normalized replacement, built
# from the two columns of the new_kamusalay.csv dataframe loaded above.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    """Replace each colloquial (alay) token with its dictionary form.

    Tokens absent from ``alay_dict_map`` pass through unchanged.
    """
    tokens = text.split(' ')
    normalized = [alay_dict_map.get(token, token) for token in tokens]
    return ' '.join(normalized)
def remove_stopword(text):
    """Drop Indonesian stopwords from *text* and tidy the whitespace.

    Returns the surviving words joined by single spaces, with
    leading/trailing whitespace stripped.
    """
    # Build a set once per call: O(1) membership tests instead of the
    # original O(n) scan of the raw list for every word.
    stopword_set = set(stopwords.words('indonesian'))
    text = ' '.join(['' if word in stopword_set else word for word in text.split(' ')])
    text = re.sub(' +', ' ', text)  # collapse gaps left by removed words
    return text.strip()
def stemming(text):
    """Reduce the words of *text* to their root forms with the module-level Sastrawi stemmer."""
    return stemmer.stem(text)
def preprocessing(text):
    """Full cleaning pipeline for one raw tweet/text string.

    Steps: lowercase -> strip noise (rt/user/URLs) -> drop punctuation ->
    normalize slang -> stem -> remove stopwords.
    """
    text = lowercase(text)               # 1: case-fold first so patterns match
    # 2: remove URLs/rt/user BEFORE stripping punctuation; the original
    # order deleted ':' and '/' first, so the URL regex could never match
    # and URL fragments leaked into the output.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)  # 3: punctuation/symbols -> spaces
    text = normalize_alay(text)          # 4: slang -> dictionary form
    text = stemming(text)                # 5: Sastrawi stemming
    text = remove_stopword(text)         # 6: drop Indonesian stopwords
    return text