# preprocessing.py
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # replace literal '\t', '\n', and '\u' escape sequences left over from scraping,
    # then strip any remaining backslashes
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions
    text = re.sub(r"@[\w_-]+", "", text)
    # remove links and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URL fragments
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove any remaining symbols
    text = re.sub(r'[!@#$%^&*?,"|:]+', "", text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # collapse runs of whitespace into a single space and trim leading/trailing whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
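
# Illustrative trace of clean_text (the sample string is made up, not from the dataset):
#   clean_text("Halo @user cek https://example.com #promo 123!!")  ->  "Halo cek"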

def case_folding(text):
    return text.lower()


def tokenize(text):
    return word_tokenize(text)

normalized_word = pd.read_csv("key_norm_1.csv")
normalized_word_dict = {}
# build a slang -> normalized-word lookup (first column: slang, second column: normal form)
for row in normalized_word.itertuples(index=False):
    if len(row) >= 2:
        normalized_word_dict[row[0]] = row[1]
    else:
        # handle the case where the row has fewer than two elements
        print(f"Warning: Row {row} has fewer than two elements.")


def normalized_term(document):
    return [normalized_word_dict.get(term, term) for term in document]
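
# e.g., if key_norm_1.csv contains the (hypothetical) row "gak","tidak", then
#   normalized_term(["gak", "suka"]) -> ["tidak", "suka"]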

list_stopwords = stopwords.words('indonesian')
# ---------------------------- manually add stopwords ------------------------------------
# append additional stopwords
list_stopwords.extend(['yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                       '&amp', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])
# ----------------------- add stopwords from txt file ------------------------------------
# read txt stopwords using pandas
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)
# the file holds one space-separated string of stopwords; split it into a list and append
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
# ---------------------------------------------------------------------------------------
# convert the list to a set for fast membership lookups
list_stopwords = set(list_stopwords)

def remove_stopwords(words):
    # remove stopwords from the token list
    return [word for word in words if word not in list_stopwords]

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(document):
    # stem each unique term only once and cache the result (Sastrawi stemming is slow)
    term_dict = {}
    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)
    # map every term in the document to its stemmed form
    return [term_dict[term] for term in document]
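
# --------------------------------- example usage ----------------------------------------
# A minimal end-to-end sketch of the pipeline; the sample tweet below is made up, and the
# exact output depends on the contents of key_norm_1.csv and stopwords.txt.
if __name__ == "__main__":
    sample = "RT @user: Aku gak suka sama filmnya!!! https://t.co/xyz 12345"
    text = clean_text(sample)
    text = case_folding(text)
    tokens = tokenize(text)
    tokens = normalized_term(tokens)
    tokens = remove_stopwords(tokens)
    tokens = stem_text(tokens)
    print(tokens)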