import nltk
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # remove literal escape sequences (tab, newline, unicode) and stray backslashes
    text = text.replace('\\t', ' ').replace('\\n', ' ').replace('\\u', ' ').replace('\\', '')
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove @mentions
    text = re.sub(r"@[\w_-]+", "", text)
    # remove links and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URLs
    text = text.replace("http://", " ").replace("https://", " ")
    # remove numbers
    text = re.sub(r"\d+", "", text)
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove leading & trailing whitespace
    text = text.strip()
    # collapse multiple whitespace into a single space
    text = re.sub(r"\s+", ' ', text)
    # remove single characters
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # remove any remaining symbols
    text = re.sub(r'[!@#$%^&*?,"|:]+', "", text)

    return text
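
# Illustrative example (assumed input, not from the original script):
#   clean_text("RT @user: cek https://t.co/abc 123!!")  ->  "RT cek"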

def case_folding(text):
    return text.lower()

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

normalized_word = pd.read_csv("key_norm_1.csv")
normalized_word_dict = {}

for row in normalized_word.itertuples(index=False):
    # guard against rows with a missing normal form (itertuples rows always
    # have one field per column, so also check for NaN rather than length alone)
    if len(row) >= 2 and pd.notna(row[1]):
        normalized_word_dict[row[0]] = row[1]
    else:
        print(f"Warning: row {row} is missing a slang/normal-form pair.")


def normalized_term(document):
    # replace each slang token with its normal form if one is known
    return [normalized_word_dict.get(term, term) for term in document]
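
# Illustrative example (assuming key_norm_1.csv maps "gak" -> "tidak"):
#   normalized_term(["gak", "suka"])  ->  ["tidak", "suka"]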


list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords -----------------------------------
# append additional stopwords (duplicates removed; they collapse in the set anyway)
list_stopwords.extend(['yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                       '&amp', 'yah', 'gw', 'lu', 'lo', 'gtw', 'bukan',
                       'iyaa', 'ruarrr', 'itu', 'gue', 'dan', 'juga',
                       'cm', 'cmn', 'emg', 'hickkkkk'])

# ----------------------- add stopwords from txt file -----------------------------------
# read the stopword txt file using pandas
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

# split the first row into a list of stopwords and append it
# (assumes the file keeps all stopwords space-separated on a single line)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert the list to a set for fast membership lookup
list_stopwords = set(list_stopwords)


# remove stopwords from a token list
def remove_stopwords(words):
    return [word for word in words if word not in list_stopwords]
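
# Illustrative example (assuming "aku" and "ini" are in the stopword set):
#   remove_stopwords(["aku", "suka", "film", "ini"])  ->  ["suka", "film"]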

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(document):
    # cache the stem of each unique term so Sastrawi runs only once per word
    term_dict = {}

    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmer.stem(term)

    # map every token in the document to its stemmed form
    return [term_dict[term] for term in document]
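

# Minimal end-to-end sketch of the pipeline. The sample tweet below is an
# assumption for illustration; running this requires key_norm_1.csv and
# stopwords.txt alongside the script.
if __name__ == "__main__":
    sample = "RT @user: Gak suka bangetttt sama film ini!!! https://t.co/xyz 123"
    tokens = tokenize(case_folding(clean_text(sample)))
    tokens = normalized_term(tokens)
    tokens = remove_stopwords(tokens)
    print(stem_text(tokens))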