# preprocessing.py
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # replace literal '\t', '\n', and '\u' escape sequences left over from scraping,
    # then strip any remaining backslashes
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions
    text = re.sub(r"@[\w_-]+", "", text)
    # remove links and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URL fragments
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove any remaining symbols
    text = re.sub(r'[!@#$%^&*?,"|:]+', "", text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # collapse runs of whitespace into a single space and trim leading/trailing whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
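
# Illustrative trace of clean_text (the sample string is made up, not from the dataset):
#   clean_text("Halo @user cek https://example.com #promo 123!!")  ->  "Halo cek"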

def case_folding(text):
    return text.lower()


def tokenize(text):
    return word_tokenize(text)

normalized_word = pd.read_csv("key_norm_1.csv")
normalized_word_dict = {}
# build a slang -> normalized-word lookup (first column: slang, second column: normal form)
for row in normalized_word.itertuples(index=False):
    if len(row) >= 2:
        normalized_word_dict[row[0]] = row[1]
    else:
        # handle the case where the row has fewer than two elements
        print(f"Warning: Row {row} has fewer than two elements.")


def normalized_term(document):
    return [normalized_word_dict.get(term, term) for term in document]
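
# e.g., if key_norm_1.csv contains the (hypothetical) row "gak","tidak", then
#   normalized_term(["gak", "suka"]) -> ["tidak", "suka"]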

list_stopwords = stopwords.words('indonesian')
# ---------------------------- manually add stopwords ------------------------------------
# append additional stopwords
list_stopwords.extend(['yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                       '&amp', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])
# ----------------------- add stopwords from txt file ------------------------------------
# read txt stopwords using pandas
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)
# the file holds one space-separated string of stopwords; split it into a list and append
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
# ---------------------------------------------------------------------------------------
# convert the list to a set for fast membership lookups
list_stopwords = set(list_stopwords)

def remove_stopwords(words):
    # remove stopwords from the token list
    return [word for word in words if word not in list_stopwords]

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(document):
    # stem each unique term only once and cache the result (Sastrawi stemming is slow)
    term_dict = {}
    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)
    # map every term in the document to its stemmed form
    return [term_dict[term] for term in document]
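
# --------------------------------- example usage ----------------------------------------
# A minimal end-to-end sketch of the pipeline; the sample tweet below is made up, and the
# exact output depends on the contents of key_norm_1.csv and stopwords.txt.
if __name__ == "__main__":
    sample = "RT @user: Aku gak suka sama filmnya!!! https://t.co/xyz 12345"
    text = clean_text(sample)
    text = case_folding(text)
    tokens = tokenize(text)
    tokens = normalized_term(tokens)
    tokens = remove_stopwords(tokens)
    tokens = stem_text(tokens)
    print(tokens)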