import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

warnings.filterwarnings('ignore')
nltk.download('stopwords')

# Only the lemmatizer is needed, so disable the parser and NER for speed
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

data = pd.read_csv('chatgpt_reviews.csv')
data.head()
data.info()
data.describe()
data.describe(include='object')

"""

Analysis of Rating column

""" data['rating'].value_counts().sort_index() data['rating'].value_counts(normalize=True).mul(100).round(2).sort_index() #Plot palette = "deep" sns.set_palette(palette) sns.countplot(data=data, x='rating') plt.xlabel('Rating') plt.ylabel('No. of Users') plt.title('Ratings Distribution') plt.show() """Preprocessing""" #Find no. of missing values in each column data.isnull().sum().sort_values(ascending=False) #Combine Review Time and Review data['complete_review'] = data['title'] +' .'+data['review'] data = data.drop(['date','review','title'],axis='columns') data.head() def preprocess_data(text): emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" "]+", flags=re.UNICODE) special_char_removal = re.compile(r'[^a-z\d\s]+', re.IGNORECASE) text = text.lower() text = emoji_pattern.sub('', text) text = special_char_removal.sub('', text) return text data['complete_review'] = data['complete_review'].apply(lambda x: preprocess_data(x)) data['complete_review'].head() preprocess_data("Hallo, My name") """hapus stopwords""" stop = stopwords.words('english') data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop)) """Lemmatization""" def space(comment): doc = nlp(comment) return " ".join([token.lemma_ for token in doc]) data['complete_review']= data['complete_review'].apply(space) """menghapus spesifik kata""" words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io'] data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in words_to_remove)) data['sentiment'] = data['rating'].apply(lambda rating: 1 if rating > 3 else 0) data.head(5) data['sentiment'].value_counts(normalize=True).mul(100).round(2) """Data is Imbalanced as about 66% of sentiment is positive, 24% is negative and 9.5% is neutral. 
"""# Reviews Analysis"""

# Word cloud over all reviews
stopword = set(stopwords.words('english'))
text = " ".join(review for review in data.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Split reviews by sentiment
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]

# Positive sentiment word cloud
text = " ".join(review for review in positive.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Negative sentiment word cloud
text = " ".join(review for review in negative.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

"""Model: Bag-of-Words Vectorization"""

# Preprocessing and bag-of-words vectorization using CountVectorizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
X = cv.fit_transform(data['complete_review'])
y = data['sentiment']

"""Train Test Split"""

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=17, stratify=y)

"""Handle Imbalanced Data

SMOTE is applied to the training set only; oversampling before the split would leak synthetic copies of training reviews into the test set and inflate the evaluation scores."""
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

"""XGBoost"""

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Sentiment is binary, so use a binary objective rather than multi:softmax
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'eta': 0.4,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# binary:logistic outputs probabilities, so threshold at 0.5 to get class labels
preds = model.predict(dtest)
pred_labels = [int(pred > 0.5) for pred in preds]
print(classification_report(y_test, pred_labels))

def predict(kata):
    # Apply the same preprocessing pipeline that was used on the training data
    kata = preprocess_data(kata)
    kata = " ".join(word for word in kata.split() if word not in stop)
    kata = space(kata)
    kata = " ".join(word for word in kata.split() if word not in words_to_remove)
    # Reuse the already-fitted CountVectorizer; refitting it here would only redo work
    X_pred = cv.transform(pd.Series([kata]))
    dmatrix = xgb.DMatrix(X_pred)
    return int(model.predict(dmatrix)[0] > 0.5)
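"""A quick smoke test of the `predict` helper. The sample reviews below are made up for illustration (they are not from the dataset); the returned class index is mapped back to a readable label."""

label_names = {0: 'negative', 1: 'positive'}
for sample in ["This assistant is incredibly helpful and fast",
               "Terrible experience, it keeps crashing and the answers are wrong"]:
    print(f"{sample!r} -> {label_names[predict(sample)]}")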