"""Indonesian text preprocessing pipeline.

Provides the standard preprocessing stages for Indonesian social-media text:
cleaning, case folding, tokenization, slang normalization (from
``key_norm_1.csv``), stopword removal (NLTK + manual + ``stopwords.txt``)
and stemming via Sastrawi.

NOTE: this module performs I/O at import time (NLTK corpus downloads and
two CSV reads) — importing it requires those files to be present.
"""

import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Required NLTK resources; download is a no-op once cached locally.
nltk.download('stopwords')
nltk.download('punkt')


def clean_text(text):
    """Strip noise (escapes, non-ASCII, mentions, URLs, digits, punctuation,
    extra whitespace, single chars, symbols) from raw text.

    Parameters:
        text (str): raw input text.

    Returns:
        str: cleaned text.
    """
    # Remove literal escape sequences (tab, newline, unicode marker) and
    # stray backslashes left over from CSV/export escaping.
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # Remove non-ASCII (emoticons, Chinese characters, etc.); 'replace'
    # substitutes '?', which is stripped with the punctuation below.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove @mentions.
    text = re.sub(r"[@][\w_-]+", "", text)
    # Remove hashtags and full URLs (raw string: avoids invalid-escape
    # warnings; the pattern itself is unchanged).
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove incomplete URL prefixes left behind.
    text = text.replace("http://", " ").replace("https://", " ")
    # Remove numbers.
    text = re.sub(r"\d+", "", text)
    # Remove punctuation in a single C-level pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Trim leading/trailing whitespace.
    text = text.strip()
    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)
    # Remove single-character tokens.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Remove leftover symbols (the original wrapped this in a redundant
    # ''.join(...) — re.sub already returns a str).
    text = re.sub(r"[\!\@\#$\%\^\&\*\?\,\"\|\:]+", "", text)
    return text


def case_folding(text):
    """Lower-case the text."""
    return text.lower()


def tokenize(text):
    """Split text into word tokens using NLTK's word_tokenize."""
    return word_tokenize(text)


# Slang/abbreviation normalization table: column 0 -> column 1.
normalizad_word = pd.read_csv("key_norm_1.csv")

normalizad_word_dict = {}
for row in normalizad_word.itertuples(index=False):
    if len(row) >= 2:
        normalizad_word_dict[row[0]] = row[1]
    else:
        # Malformed row (fewer than two columns): skip with a warning.
        print(f"Warning: Row {row} has less than two elements.")


def normalized_term(document):
    """Replace each token with its normalized form when it appears in the
    lookup table; unknown tokens pass through unchanged.

    Parameters:
        document (iterable of str): token list.

    Returns:
        list of str: normalized tokens.
    """
    # dict.get with the term itself as default: one lookup instead of two.
    return [normalizad_word_dict.get(term, term) for term in document]


list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords ------------------------------------
# Informal/slang stopwords not covered by NLTK's Indonesian list.
# BUG FIX: the original list ended with the single string 'emg, hickkkkk'
# (missing quotes), so neither 'emg' nor 'hickkkkk' was ever filtered;
# they are now two separate entries.
list_stopwords.extend([
    "yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar',
    'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si',
    'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja', 'n',
    't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt', '&', 'yah',
    'gw', 'lu', 'lo', 'gtw', 'bukan', 'iyaa', 'si', 'ruarrr', 'itu',
    'gue', 'dan', 'juga', 'cm', 'cmn', 'emg', 'hickkkkk',
])

# ----------------------- add stopwords from txt file ------------------------------------
# Read the txt stopword file via pandas; assumes all stopwords sit
# space-separated on the FIRST line of the file — TODO confirm format.
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
# ----------------------------------------------------------------------------------------

# Convert to a set for O(1) membership tests (name kept for compatibility).
list_stopwords = set(list_stopwords)


def remove_stopwords(words):
    """Drop stopword tokens from the token list."""
    return [word for word in words if word not in list_stopwords]


# Sastrawi stemmer for Indonesian.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(document):
    """Stem each token, memoizing so every unique term is stemmed only once
    (stemmer.stem is expensive; documents repeat terms).

    Parameters:
        document (iterable of str): token list.

    Returns:
        list of str: stemmed tokens, same order/length as input.
    """
    term_dict = {}
    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)
    return [term_dict[term] for term in document]