"""Train a three-class sentiment model on app reviews and expose ``predict``.

Importing this module downloads the required NLTK corpora, reads
``chatgpt_reviews.csv``, cleans the review text, builds a bag-of-words
representation, trains an XGBoost booster and leaves the fitted vectorizer
(``cv``) and booster (``model``) at module level for ``predict`` to use.
"""
import re
import warnings

import nltk
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

warnings.filterwarnings('ignore')

# Compiled once at import time instead of on every preprocess_data() call.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)

stop = stopwords.words('english')
# Set copy of the stop-word list: O(1) membership tests while filtering tokens.
stopword = set(stop)

lemmatizer = WordNetLemmatizer()

# Domain words that appear in most reviews and are dropped after lemmatization.
words_to_remove = ['chatgpt', 'app', 'chatgpts', 'chat', 'gpt', 'iphone', 'ipad',
                   'gpt4', 'phone', 'number', 'ai', 'use', 'io']
_WORDS_TO_REMOVE = frozenset(words_to_remove)


def preprocess_data(text):
    """Lower-case *text* and strip emoji and special characters.

    Args:
        text: Raw review string.

    Returns:
        The lower-cased string with every character matched by the emoji
        pattern removed, followed by every run of characters that are not
        letters a-z, digits or whitespace.
    """
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    text = _SPECIAL_CHAR_PATTERN.sub('', text)
    return text


def lemmatize_text(text):
    """Lemmatize every token of *text* and return the re-joined string.

    Args:
        text: Already cleaned review string.

    Returns:
        The WordNet-lemmatized tokens joined by single spaces.
    """
    # Split the text into word tokens.
    tokens = word_tokenize(text)
    # Lemmatize each token individually.
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join the tokens back into a single string.
    return ' '.join(lemmatized_tokens)


def _clean_review(text):
    """Run the full text-cleaning pipeline on a single review.

    The same steps, in the same order, are applied to the training corpus
    and to text passed to ``predict`` so that inference tokens line up with
    the vocabulary learned by the vectorizer.
    """
    text = preprocess_data(text)
    text = " ".join(word for word in text.split() if word not in stopword)
    text = lemmatize_text(text)
    return " ".join(word for word in text.split() if word not in _WORDS_TO_REMOVE)


data = pd.read_csv('chatgpt_reviews.csv')
# A missing title or review would make the concatenation NaN and crash on
# text.lower(); treat missing parts as empty strings instead.
data['complete_review'] = data['title'].fillna('') + ' .' + data['review'].fillna('')
data = data.drop(['date', 'review', 'title'], axis='columns')

data['complete_review'] = data['complete_review'].apply(_clean_review)

# Map star ratings to class labels: 2 for ratings above 3, 1 for a rating of
# exactly 3, and 0 otherwise.
data['sentiment'] = data['rating'].apply(
    lambda rating: 2 if rating > 3 else (1 if rating == 3 else 0))
# Percentage share of each sentiment class (kept for inspection).
sentiment_share = data['sentiment'].value_counts(normalize=True).mul(100).round(2)

# Pre-processing and bag-of-words vectorization using CountVectorizer.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
X = cv.fit_transform(data['complete_review'])
y = data['sentiment']

# Split BEFORE oversampling: running SMOTE on the full data set lets synthetic
# samples interpolated from held-out rows leak into the training set and fills
# the test set with synthetic rows.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.15, random_state=17, stratify=y)
# Seeded so the run is reproducible, like the split and the booster.
smote = SMOTE(random_state=17)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'merror',
    'eta': 0.4,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

preds = model.predict(dtest)
pred_labels = [int(pred) for pred in preds]


def predict(kata):
    """Predict the sentiment class of a single piece of review text.

    Args:
        kata: Raw review text.

    Returns:
        The first element of the booster's prediction array for the text,
        i.e. the predicted class label (0, 1 or 2, using the same encoding
        as the ``sentiment`` column).
    """
    # Apply the same cleaning as the training corpus, then reuse the already
    # fitted vectorizer; refitting it on every call only repeated work.
    cleaned_kata = _clean_review(kata)
    X_pred = cv.transform([cleaned_kata])
    dmatrix = xgb.DMatrix(X_pred)
    preds = model.predict(dmatrix)
    return preds[0]