import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

nltk.download('wordnet')
nltk.download('punkt')

# Slang dictionary: each line of kamus.txt maps a slang word to its normalized form.
kamus = pd.read_csv('kamus.txt', sep=" ", header=None, names=['slang', 'fix'])
slang_list = kamus['slang'].tolist()
fix_list = kamus['fix'].tolist()
# Lookup table for slang normalization.
slang_map = dict(zip(slang_list, fix_list))


def TextProcess(text):
    """Clean a raw tweet and return unigrams, bigrams, and trigrams as a numpy array."""
    # 1. Change all text to lowercase
    text = text.lower()
    # 2. Remove mentions
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # 3. Remove hashtags
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # 4. Remove newlines (both real newlines and the literal "\n" left by scraping)
    text = re.sub(r"\\n|\n", " ", text)
    # 5. Remove surrounding whitespace
    text = text.strip()
    # 6. Remove links
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # 7. Remove non-text characters such as emojis and mathematical symbols
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # 8. Remove the retweet marker "rt" (word boundaries keep words like "start" intact)
    text = re.sub(r"\brt\b", " ", text)
    # 9. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 10. Tokenize and normalize slang via dictionary lookup
    tokens = word_tokenize(text)
    tokens = [slang_map.get(t, t) for t in tokens]
    # 11. Append bigrams and trigrams to the unigram tokens
    _2gram = [' '.join(e) for e in ngrams(tokens, 2)]
    _3gram = [' '.join(e) for e in ngrams(tokens, 3)]
    return np.array(tokens + _2gram + _3gram)


def TextProcess2(text):
    """Clean a raw tweet and return a single slang-normalized, stemmed string."""
    # 1-8. Same cleaning steps as TextProcess, minus the punctuation stripping
    text = text.lower()
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    text = re.sub(r"\\n|\n", " ", text)
    text = text.strip()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    text = re.sub(r"\brt\b", " ", text)
    # 9. Tokenize and normalize slang via dictionary lookup
    tokens = word_tokenize(text)
    tokens = [slang_map.get(t, t) for t in tokens]
    # 10. Stopword removal (currently disabled)
    # stopwords_en = set(stopwords.words('english'))
    # tokens = [word for word in tokens if word not in stopwords_en]
    # 11. Stem each token, then rejoin into a single string
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokens)


# Topic labels (in Indonesian) for the ten topic indices.
TOPICS = {
    0: 'Baterai cepat habis',                                  # battery drains quickly
    1: 'hp tidak berfungsi, tidak sesuai, tidak nyala',        # phone broken, not as described, won't turn on
    2: 'barang tidak sesuai deskripsi, hp mati',               # item doesn't match description, phone dead
    3: 'positif',                                              # positive
    4: 'barang tidak sesuai pesanan',                          # item doesn't match the order
    5: 'barang rusak',                                         # item damaged
    6: 'barang tidak sesuai, suara tidak berfungsi',           # item not as expected, sound not working
    7: 'warna tidak sesuai, atau barang tidak sesuai gambar',  # wrong color, or item doesn't match the picture
    8: 'barang tidak sesuai deskripsi, pengiriman lama',       # item doesn't match description, slow delivery
    9: 'barang kosong, cancel, retur',                         # out of stock, cancellation, return
}


def Label(num):
    """Map a topic index to its topic name; returns None for an unknown index."""
    return TOPICS.get(num)
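
# --- Minimal usage sketch ---
# Assumes kamus.txt is present in the working directory; the sample tweet and
# the one-row dataframe below are hypothetical, purely for illustration.
if __name__ == "__main__":
    sample = "RT @user: baterai hp nya cepat habis bgt! cek www.example.com #review"
    print(TextProcess(sample))    # numpy array of unigrams + bigrams + trigrams
    print(TextProcess2(sample))   # single cleaned, slang-normalized, stemmed string
    print(Label(0))               # -> 'Baterai cepat habis'

    # Typical dataframe usage: apply either function over a text column.
    df = pd.DataFrame({'review': [sample]})
    df['clean'] = df['review'].apply(TextProcess2)
    print(df['clean'].iloc[0])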