Jasmineavrile committed on
Commit
7e80b79
1 Parent(s): 34342ed

Upload preprocessing.py

Files changed (1)
  1. preprocessing.py +104 -0
preprocessing.py ADDED
@@ -0,0 +1,104 @@
+ import nltk
+ import pandas as pd
+ import re
+ import string
+ from nltk.tokenize import word_tokenize
+ from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
+ from nltk.corpus import stopwords
+ nltk.download('stopwords')
+ nltk.download('punkt')
+
+ def clean_text(text):
+     # remove literal tab, newline, and unicode escape sequences, and backslashes
+     text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
+     # remove non-ASCII characters (emoticons, non-Latin scripts, etc.)
+     text = text.encode('ascii', 'replace').decode('ascii')
+     # remove @mentions
+     text = re.sub(r"[@][\w_-]+", "", text)
+     # remove links and hashtags
+     text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
+     # remove incomplete URLs
+     text = text.replace("http://", " ").replace("https://", " ")
+     # remove numbers
+     text = re.sub(r"\d+", "", text)
+     # remove punctuation
+     text = text.translate(str.maketrans("", "", string.punctuation))
+     # strip leading & trailing whitespace
+     text = text.strip()
+     # collapse multiple whitespace into a single space
+     text = re.sub(r'\s+', ' ', text)
+     # remove single characters
+     text = re.sub(r"\b[a-zA-Z]\b", "", text)
+     # remove leftover symbols
+     text = re.sub(r"[!@#$%^&*?,\"|:]+", "", text)
+
+     return text
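+
+ # Illustrative example (hypothetical tweet):
+ #   clean_text("RT @user cek https://t.co/abc yaaa!!! 123") -> "RT cek yaaa"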
+
+ def case_folding(text):
+     return text.lower()
+
+ def tokenize(text):
+     return word_tokenize(text)
+
+ normalized_word = pd.read_csv("key_norm_1.csv")
+ normalized_word_dict = {}
+
+ for row in normalized_word.itertuples(index=False):
+     if len(row) >= 2:
+         normalized_word_dict[row[0]] = row[1]
+     else:
+         # handle rows with fewer than two elements
+         print(f"Warning: Row {row} has fewer than two elements.")
+
+
+ def normalized_term(document):
+     # replace each term with its normal form if it appears in the dictionary
+     return [normalized_word_dict.get(term, term) for term in document]
+
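+ # Illustrative: if key_norm_1.csv contained a hypothetical row mapping "yg" to "yang",
+ # then normalized_term(["yg", "bagus"]) -> ["yang", "bagus"]
+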
+ list_stopwords = stopwords.words('indonesian')
+
+ # ---------------------------- manually add stopwords -----------------------------------
+ # append additional stopwords
+ list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
+                        'kalo', 'amp', 'biar', 'bikin', 'bilang',
+                        'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
+                        'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
+                        'jd', 'jgn', 'sdh', 'aja', 'n', 't',
+                        'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
+                        '&amp', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
+                        'iyaa', 'si', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
+                        'cm', 'cmn', 'emg', 'hickkkkk'])
+
+ # ----------------------- add stopwords from a txt file ---------------------------------
+ # read the txt stopword file using pandas
+ txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)
+
+ # append every stopword in the file (handles one-per-line and space-separated formats)
+ for line in txt_stopword["stopwords"]:
+     list_stopwords.extend(str(line).split())
+
+ # ---------------------------------------------------------------------------------------
+
+ # convert the list to a set for fast membership checks
+ list_stopwords = set(list_stopwords)
+
+
+ # remove stopwords from the token list
+ def remove_stopwords(words):
+     return [word for word in words if word not in list_stopwords]
+
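+ # Illustrative: remove_stopwords(["aku", "suka", "nlp"]) drops any token found in
+ # list_stopwords (e.g. the pronoun "aku" from NLTK's Indonesian stopword list).
+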
+ factory = StemmerFactory()
+ stemmer = factory.create_stemmer()
+
+ def stem_text(document):
+     # create a dictionary to cache the stem of each unique term
+     term_dict = {}
+
+     # apply stemming once per unique term
+     for term in document:
+         if term not in term_dict:
+             term_dict[term] = stemmer.stem(term)
+
+     # map every term in the document to its stemmed form
+     return [term_dict[term] for term in document]
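+
+
+ # Minimal usage sketch (illustration only): runs the full pipeline on a sample
+ # sentence; assumes "key_norm_1.csv" and "stopwords.txt" exist in the working directory.
+ if __name__ == "__main__":
+     sample = "RT @user: Keren bangetttt!!! cek https://example.com #promo 123"
+     tokens = tokenize(case_folding(clean_text(sample)))
+     tokens = normalized_term(tokens)
+     tokens = remove_stopwords(tokens)
+     print(stem_text(tokens))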