Spaces:
Runtime error
Runtime error
import re | |
import pandas as pd | |
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory | |
import nltk | |
from nltk.corpus import stopwords | |
# Fetch the NLTK stopword corpus only when it is not already installed, so
# importing this module does not trigger a network request on every start-up.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
# Header-less CSV read with latin-1 encoding; columns 0 and 1 are renamed to
# 'original' and 'replacement'.
# NOTE(review): presumably slang ("alay") word -> standard word pairs; confirm
# against the CSV contents.
alay_dict = pd.read_csv(
    './new_kamusalay.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

# A single Sastrawi stemmer instance, built through its factory and shared by
# the functions in this module.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def lowercase(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
def remove_unnecessary_char(text):
    """Strip newlines, URLs and the standalone tokens 'rt' and 'user'.

    Each removed item is replaced by a space and runs of spaces are then
    collapsed to a single space. Leading/trailing spaces are not trimmed.

    The 'rt' and 'user' tokens are matched as whole words only, so words that
    merely contain those letters (e.g. "partai") are left intact. Matching is
    case-sensitive; callers are expected to lowercase the text first.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\n', ' ', text)  # Newlines become spaces
    # URLs go first: otherwise the 'rt'/'user' substitutions below could cut a
    # URL in two and leave its tail behind.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    text = re.sub(r'\brt\b', ' ', text)  # Retweet marker, whole word only
    text = re.sub(r'\buser\b', ' ', text)  # Username placeholder, whole word only
    text = re.sub(r' +', ' ', text)  # Collapse repeated spaces
    return text
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside ``0-9a-zA-Z`` with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)
# Plain dict built by pairing the 'original' column with the 'replacement'
# column row by row; a later duplicate key overwrites an earlier one.
_alay_sources = alay_dict['original']
_alay_targets = alay_dict['replacement']
alay_dict_map = dict(zip(_alay_sources, _alay_targets))
def normalize_alay(text):
    """Swap every space-separated token found in ``alay_dict_map``.

    The text is split on single spaces; tokens that are keys of
    ``alay_dict_map`` are replaced by the mapped value, all other tokens are
    kept unchanged, and the result is re-joined with single spaces.
    """
    normalized_tokens = []
    for token in text.split(' '):
        normalized_tokens.append(alay_dict_map.get(token, token))
    return ' '.join(normalized_tokens)
_INDONESIAN_STOPWORDS = None  # Filled on first use by _indonesian_stopwords()


def _indonesian_stopwords():
    """Return NLTK's Indonesian stopwords as a frozenset, loading them once."""
    global _INDONESIAN_STOPWORDS
    if _INDONESIAN_STOPWORDS is None:
        _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
    return _INDONESIAN_STOPWORDS


def remove_stopword(text):
    """Drop Indonesian stopwords from a space-separated string.

    The text is split on single spaces, tokens that appear in NLTK's
    'indonesian' stopword list are discarded, and the remaining tokens are
    joined with single spaces. Surrounding whitespace is stripped.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    # The stopword list is cached as a set so it is not re-read and scanned
    # linearly for every token on every call.
    stop_set = _indonesian_stopwords()
    # Empty tokens (from repeated spaces) are skipped, which gives the same
    # result as joining and then collapsing runs of spaces.
    kept = [word for word in text.split(' ') if word and word not in stop_set]
    return ' '.join(kept).strip()
def stemming(text):
    """Return ``text`` passed through the module-level ``stemmer.stem``."""
    stemmed = stemmer.stem(text)
    return stemmed
def preprocess(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: lowercase, remove newlines/URLs/'rt'/'user' tokens,
    replace non-alphanumeric characters with spaces, normalise tokens through
    the alay dictionary, stem, and remove stopwords.

    NOTE(review): URL/token removal now runs BEFORE punctuation is stripped.
    In the previous order the punctuation inside a URL was already replaced by
    spaces, so the URL pattern could never match and URL fragments leaked into
    the output. Any model fitted on text produced by the old order may need to
    be re-fitted.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = lowercase(text)  # 1
    text = remove_unnecessary_char(text)  # 2: needs intact '://' and '.' to spot URLs
    text = remove_nonaplhanumeric(text)  # 3
    text = normalize_alay(text)  # 4
    text = stemming(text)  # 5
    text = remove_stopword(text)  # 6
    return text