Jasmineavrile committed
Commit 7e80b79
1 Parent(s): 34342ed
Upload preprocessing.py
preprocessing.py +104 -0
preprocessing.py
ADDED
@@ -0,0 +1,104 @@
import nltk
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')


def clean_text(text):
    # remove literal tab, newline, and unicode-escape markers, then stray backslashes
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions
    text = re.sub(r"[@][\w_-]+", "", text)
    # remove links and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # remove incomplete URLs
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove leading & trailing whitespace
    text = text.strip()
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # remove remaining symbols
    text = re.sub(r"[\!\@\#$\%\^\&\*\?\,\"\|\:]+", "", text)

    return text


def case_folding(text):
    return text.lower()


def tokenize(text):
    tokens = word_tokenize(text)
    return tokens


normalized_word = pd.read_csv("key_norm_1.csv")
normalized_word_dict = {}

for row in normalized_word.itertuples(index=False):
    if len(row) >= 2:
        normalized_word_dict[row[0]] = row[1]
    else:
        # Handle the case where the row has fewer than two elements
        print(f"Warning: Row {row} has fewer than two elements.")


def normalized_term(document):
    return [normalized_word_dict[term] if term in normalized_word_dict else term for term in document]


list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords ------------------------------------
# append additional stopwords
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'si', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])

# ----------------------- add stopwords from txt file ------------------------------------
# read the stopword txt file using pandas
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

# convert the stopword string to a list & append the additional stopwords
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert the list to a set for fast membership lookup
list_stopwords = set(list_stopwords)


# remove stopwords from the token list
def remove_stopwords(words):
    return [word for word in words if word not in list_stopwords]


factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(document):
    # Create a dictionary for unique terms
    term_dict = {}

    # Stem each unique term once
    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)

    # Map the stemmed terms back onto the document
    return [term_dict[term] for term in document]
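
For context, a minimal usage sketch of how these functions chain into a single pipeline. The `preprocess` wrapper and the sample sentence are illustrative assumptions, not part of the committed file; note that importing `preprocessing` reads `key_norm_1.csv` and `stopwords.txt` at import time, so both files must be present alongside it.

# Hypothetical usage sketch; the preprocess() wrapper and sample text below
# are assumptions for illustration, not part of preprocessing.py.
from preprocessing import (clean_text, case_folding, tokenize,
                           normalized_term, remove_stopwords, stem_text)

def preprocess(text):
    text = clean_text(text)            # strip mentions, URLs, numbers, punctuation
    text = case_folding(text)          # lowercase
    tokens = tokenize(text)            # split into word tokens
    tokens = normalized_term(tokens)   # map slang to normalized forms (key_norm_1.csv)
    tokens = remove_stopwords(tokens)  # drop Indonesian stopwords
    return stem_text(tokens)           # Sastrawi stemming, cached per unique term

print(preprocess("Saya tidak suka menunggu lama di https://example.com !!!"))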