# RepiuBOSS / func.py
import re
import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

nltk.download('wordnet')
nltk.download('punkt')

# Slang-to-standard-word dictionary used by the text-processing functions below.
# kamus.txt holds one space-separated "slang fix" pair per line.
kamus = pd.read_csv('kamus.txt', sep=" ", header=None, names=['slang', 'fix'])
slang_map = dict(zip(kamus['slang'], kamus['fix']))
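# Illustrative, hypothetical entry: a line such as "gk tidak" would map the
# slang token "gk" to the standard word "tidak".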

def TextProcess(text):
    # 1. Lowercase the text
    text = text.lower()
    # 2. Remove mentions
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # 3. Remove hashtags
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # 4. Remove literal "\n" sequences
    text = re.sub(r"\\n", " ", text)
    # 5. Strip leading/trailing whitespace
    text = text.strip()
    # 6. Remove links
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # 7. Remove non-letter characters such as emojis and mathematical symbols
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # 8. Remove "rt" (retweet marker) as a standalone word
    text = re.sub(r"\brt\b", " ", text)
    # 9. Remove remaining punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 10. Tokenize
    token = word_tokenize(text)
    # 11. Replace slang words with their standard form
    token = [slang_map.get(word, word) for word in token]
    # 12. Append bigrams and trigrams to the unigram tokens
    _2gram = [' '.join(e) for e in ngrams(token, 2)]
    _3gram = [' '.join(e) for e in ngrams(token, 3)]
    text = np.array(token + _2gram + _3gram)
    return text
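
# Illustrative only (assumes none of these words appear in kamus.txt):
# TextProcess("Barang rusak, baterai cepat habis!") returns the unigram tokens
# plus their 2-grams and 3-grams, e.g.
# ['barang' 'rusak' 'baterai' 'cepat' 'habis' 'barang rusak' 'rusak baterai' ...]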

def TextProcess2(text):
    # 1. Lowercase the text
    text = text.lower()
    # 2. Remove mentions
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # 3. Remove hashtags
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # 4. Remove literal "\n" sequences
    text = re.sub(r"\\n", " ", text)
    # 5. Strip leading/trailing whitespace
    text = text.strip()
    # 6. Remove links
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # 7. Remove non-letter characters such as emojis and mathematical symbols
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # 8. Remove "rt" (retweet marker) as a standalone word
    text = re.sub(r"\brt\b", " ", text)
    # 9. Tokenize
    tokens = word_tokenize(text)
    # 10. Replace slang words with their standard form
    tokens = [slang_map.get(word, word) for word in tokens]
    # 11. Stopword removal (currently disabled)
    # stopwords_en = set(stopwords.words('english'))
    # tokens = [word for word in tokens if word not in stopwords_en]
    # 12. Stem each token and rejoin into a single string
    stemmer = PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in tokens)
    return text

def Label(num):
    # Map a topic-cluster number to its (Indonesian) topic description.
    topics = {
        0: 'Baterai cepat habis',                                  # battery drains quickly
        1: 'hp tidak berfungsi, tidak sesuai, tidak nyala',        # phone not working / not as ordered / won't turn on
        2: 'barang tidak sesuai deskripsi, hp mati',               # item not as described, phone dead
        3: 'positif',                                              # positive review
        4: 'barang tidak sesuai pesanan',                          # item does not match the order
        5: 'barang rusak',                                         # item damaged
        6: 'barang tidak sesuai, suara tidak berfungsi',           # item not as ordered, sound not working
        7: 'warna tidak sesuai, atau barang tidak sesuai gambar',  # wrong colour, or item not as pictured
        8: 'barang tidak sesuai deskripsi, pengiriman lama',       # item not as described, slow delivery
        9: 'barang kosong, cancel, retur',                         # item out of stock, cancelled, returned
    }
    # Unknown cluster numbers previously left `topic` undefined; return None instead.
    return topics.get(num)
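
# Minimal usage sketch (not part of the original app; an assumption about how the
# functions are meant to be called). It only runs when func.py is executed directly.
if __name__ == "__main__":
    sample = "Baterai cepat habis, barang tidak sesuai deskripsi!"
    # Array of tokens plus n-grams, e.g. for a count-vectorizer-style model
    print(TextProcess(sample))
    # Single cleaned and stemmed string, e.g. for a TF-IDF pipeline
    print(TextProcess2(sample))
    # Human-readable topic name for a predicted cluster id
    print(Label(0))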