import nltk
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # remove literal escape sequences (tab, newline, unicode) and stray backslashes
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions
    text = re.sub(r"@[\w_-]+", "", text)
    # remove links and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URLs
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove leading & trailing whitespace
    text = text.strip()
    # collapse multiple whitespace into a single space
    text = re.sub(r"\s+", ' ', text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # remove any remaining symbols
    text = re.sub(r'[!@#$%^&*?,"|:]+', "", text)

    return text
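
# Illustrative example (assumed input, not from the original script):
#   clean_text("RT @user: cek https://t.co/abc 123!!")  ->  "RT cek"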

def case_folding(text):
    return text.lower()

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

normalized_word = pd.read_csv("key_norm_1.csv")
normalized_word_dict = {}

for row in normalized_word.itertuples(index=False):
    # guard against rows with a missing normal form (itertuples rows always
    # have one field per column, so also check for NaN rather than length alone)
    if len(row) >= 2 and pd.notna(row[1]):
        normalized_word_dict[row[0]] = row[1]
    else:
        print(f"Warning: row {row} is missing a slang/normal-form pair.")


def normalized_term(document):
    # replace each slang token with its normal form if one is known
    return [normalized_word_dict.get(term, term) for term in document]
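
# Illustrative example (assuming key_norm_1.csv maps "gak" -> "tidak"):
#   normalized_term(["gak", "suka"])  ->  ["tidak", "suka"]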


list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords -----------------------------------
# append additional stopwords (duplicates removed; they collapse in the set anyway)
list_stopwords.extend(['yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                       '&amp', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])

# ----------------------- add stopwords from txt file -----------------------------------
# read the stopword txt file using pandas
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

# split the first row into a list of stopwords and append it
# (assumes the file keeps all stopwords space-separated on a single line)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert the list to a set for fast membership lookup
list_stopwords = set(list_stopwords)


# remove stopwords from a token list
def remove_stopwords(words):
    return [word for word in words if word not in list_stopwords]
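
# Illustrative example (assuming "aku" and "ini" are in the stopword set):
#   remove_stopwords(["aku", "suka", "film", "ini"])  ->  ["suka", "film"]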

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(document):
    # cache the stem of each unique term so Sastrawi runs only once per word
    term_dict = {}

    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)

    # map every token in the document to its stemmed form
    return [term_dict[term] for term in document]
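

# Minimal end-to-end sketch of the pipeline. The sample tweet below is an
# assumption for illustration; running this requires key_norm_1.csv and
# stopwords.txt alongside the script.
if __name__ == "__main__":
    sample = "RT @user: Gak suka bangetttt sama film ini!!! https://t.co/xyz 123"
    tokens = tokenize(case_folding(clean_text(sample)))
    tokens = normalized_term(tokens)
    tokens = remove_stopwords(tokens)
    print(stem_text(tokens))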