Spaces:
Sleeping
Sleeping
import nltk | |
import pandas as pd | |
import re | |
import string | |
from nltk.tokenize import word_tokenize | |
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory | |
from nltk.corpus import stopwords | |
# Download NLTK data at import time: the "stopwords" corpus (read below via
# stopwords.words) and the "punkt" resource.
# NOTE(review): "punkt" is presumably what word_tokenize needs -- confirm.
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    """Strip noise from a raw text string and normalize its whitespace.

    The steps run in a fixed order: literal escape sequences, non-ASCII
    characters, @mentions, hashtags, URLs, digits, punctuation and
    single-letter words are removed; whitespace is normalized last so that
    no earlier step can leave double, leading or trailing spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Drop *literal* two-character sequences (a backslash followed by
    # t, n or u), then any backslash that is still left.
    text = (
        text.replace('\\t', ' ')
        .replace('\\n', ' ')
        .replace('\\u', ' ')
        .replace('\\', '')
    )
    # Non-ASCII characters (emoticons, CJK, ...) become '?', which the
    # punctuation step below removes.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove @mentions, including ones containing '_' or '-'.
    text = re.sub(r"[@][\w_-]+", "", text)
    # Remove hashtags, leftover @handles and scheme://... links.
    text = ' '.join(
        re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split()
    )
    # Remove bare URL schemes left over from incomplete links.
    text = text.replace("http://", " ").replace("https://", " ")
    # Remove digits.
    text = re.sub(r"\d+", "", text)
    # Remove ASCII punctuation (this also covers symbols such as ! @ # $ % ^ & * ? , " | :).
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove single-letter words.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Normalize whitespace last: collapse runs to one space and trim both ends.
    return re.sub(r"\s+", " ", text).strip()
def case_folding(text):
    """Return ``text`` with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)
# Load the normalization lookup table from CSV at import time.
normalizad_word = pd.read_csv("key_norm_1.csv")

# Map each row's first-column value to its second-column value; when a key
# repeats, the last row wins.
# NOTE(review): assumes column 0 holds the raw term and column 1 its
# normalized form -- confirm against key_norm_1.csv.
normalizad_word_dict = {}
for row in normalizad_word.itertuples(index=False):
    if len(row) < 2:
        # Handle the case where the row has less than two elements
        print(f"Warning: Row {row} has less than two elements.")
        continue
    normalizad_word_dict[row[0]] = row[1]
def normalized_term(document):
    """Replace every term that has an entry in ``normalizad_word_dict``.

    Args:
        document: An iterable of terms (tokens).

    Returns:
        A list of the same length in which each term found in
        ``normalizad_word_dict`` is replaced by its mapped value and every
        other term is kept unchanged.
    """
    normalized = []
    for term in document:
        # Fall back to the term itself when no mapping exists.
        normalized.append(normalizad_word_dict.get(term, term))
    return normalized
# Start from NLTK's built-in Indonesian stopword list.
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords -----------------------------------
# Append additional stopwords. 'emg' and 'hickkkkk' are separate entries:
# they were previously fused into the single string 'emg, hickkkkk', which
# can never equal a token, so neither word was being filtered.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'si', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])

# ----------------------- add stopwords from txt file -----------------------------------
# Read the stopword file with pandas into a single "stopwords" column.
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

# Only the first row is used: it is split on single spaces and the pieces
# are appended as additional stopwords.
# NOTE(review): presumably stopwords.txt is one space-separated line -- confirm.
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
# ---------------------------------------------------------------------------------------

# Convert the list to a set so membership tests are fast and duplicates
# collapse.
list_stopwords = set(list_stopwords)
# Remove stopwords from a list of tokens.
def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    Args:
        words: An iterable of tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept
# Build one Sastrawi stemmer at import time; stem_text() below uses this
# module-level instance on every call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stem_text(document):
    """Stem every term of ``document`` with the module-level ``stemmer``.

    Each distinct term is stemmed only once per call; repeated terms reuse
    the result.

    Args:
        document: A sequence of terms (tokens).

    Returns:
        A list with the stemmed form of each term, in the original order.
    """
    # dict.fromkeys keeps first-occurrence order, so each unique term is
    # passed to the stemmer exactly once, in the order it first appears.
    stems = {term: stemmer.stem(term) for term in dict.fromkeys(document)}
    # Map the document through the per-call lookup table.
    return [stems[term] for term in document]