import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import spacy | |
import json,os,uuid | |
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from wordcloud import WordCloud, STOPWORDS | |
from sklearn.feature_extraction.text import CountVectorizer | |
from nltk.tokenize import RegexpTokenizer | |
from imblearn.over_sampling import SMOTE | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score,classification_report | |
import xgboost as xgb | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.naive_bayes import MultinomialNB | |
from PIL import Image | |
import warnings | |
warnings.filterwarnings('ignore')
# stopwords.words('english') below needs the NLTK stop-word corpus to be
# available locally, so fetch it up front.
nltk.download('stopwords')

# spaCy pipeline; the parser and NER components are disabled because only
# token lemmas are read from it (see `space`).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

data = pd.read_csv('chatgpt_reviews.csv')
data.head()
data.describe()
data.describe(include='object')

# --- Analysis of the rating column ---
data['rating'].value_counts().sort_index()
data['rating'].value_counts(normalize=True).mul(100).round(2).sort_index()

# Plot the rating distribution
palette = "deep"
sns.set_palette(palette)
sns.countplot(data=data, x='rating')
plt.xlabel('Rating')
plt.ylabel('No. of Users')
plt.title('Ratings Distribution')
# Render and finish this figure so later plots do not draw on top of it.
plt.show()

# --- Preprocessing ---
# Number of missing values in each column
data.isnull().sum().sort_values(ascending=False)

# Combine the review title and the review body into one text column.
# Missing values are replaced with '' first: str + NaN yields NaN, which would
# drop the whole row's text and break the string cleaning that follows.
data['complete_review'] = data['title'].fillna('') + ' .' + data['review'].fillna('')
data = data.drop(['date', 'review', 'title'], axis='columns')
data.head()
# Both patterns are compiled once at import time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # broad catch-all range
    "]+",
    flags=re.UNICODE,
)
# Anything that is not a letter a-z, a digit or whitespace.
_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)


def preprocess_data(text):
    """Lower-case *text* and strip emoji and special characters.

    Args:
        text: Raw review text. Non-string input (for example ``None`` or the
            float NaN that pandas uses for missing values) is treated as an
            empty review.

    Returns:
        The cleaned string. Removed characters are deleted, not replaced by a
        space, so ``"gpt-4"`` becomes ``"gpt4"``.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .lower().
        return ""
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    return _SPECIAL_CHAR_PATTERN.sub('', text)
# Clean every combined review (lower-case, strip emoji / special characters).
data['complete_review'] = data['complete_review'].apply(preprocess_data)
data['complete_review'].head()
preprocess_data("Hallo, My name")

# Remove stopwords
# A set makes the per-token membership test O(1); the NLTK list would be
# scanned linearly for every word of every review.
stop = set(stopwords.words('english'))
data['complete_review'] = data['complete_review'].apply(
    lambda review: " ".join(word for word in review.split() if word not in stop)
)

# Lemmatization
def space(comment):
    """Return *comment* with each token replaced by its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline and the lemmas
    of all tokens are joined with single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(comment))
    return " ".join(lemmas)
# Lemmatize every review.
data['complete_review'] = data['complete_review'].map(space)

# Remove specific words
words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io']
data['complete_review'] = data['complete_review'].map(
    lambda review: " ".join(
        word for word in review.split() if word not in words_to_remove
    )
)

# Binary label: 1 for ratings above 3, 0 for everything else.
data['sentiment'] = data['rating'].gt(3).astype('int64')
data.head(5)
data['sentiment'].value_counts(normalize=True).mul(100).round(2)

# The data is imbalanced: about 66% of the sentiment is positive, 24% is
# negative and 9.5% is neutral. Note that the binary `sentiment` column built
# above assigns 0 to every rating of 3 or lower, so it does not separate
# negative from neutral reviews.
#
# Reviews Analysis
# Analysis of the review field
# The stop-word set is built once and shared by all three word clouds.
stopword = set(stopwords.words('english'))

# Word cloud over all reviews. Each cloud gets its own figure and is shown
# explicitly so that successive images do not overwrite one another.
text = " ".join(data.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Split the reviews by sentiment label (1 = positive, 0 = negative).
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]

# Positive sentiment
text = " ".join(positive.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Negative sentiment
text = " ".join(negative.complete_review)
wordcloud = WordCloud(stopwords=stopword).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
"""Model | |
Bag of Word Vectorization | |
""" | |
# Pre-processing and bag-of-words vectorization using CountVectorizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
X = cv.fit_transform(data['complete_review'])
y = data['sentiment']

# Train/test split
# Split BEFORE oversampling: if SMOTE runs on the full data set, synthetic
# points interpolated from soon-to-be test rows end up in the training set and
# the test set itself contains synthetic rows, which inflates the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=17,
    stratify=y,
)

# Handle imbalanced data: oversample the training portion only.
# The seed makes the synthetic samples reproducible between runs.
smote = SMOTE(random_state=17)
X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_oversampled, y_oversampled
"""XGBoost""" | |
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'objective': 'multi:softmax',
    # Derived from the labels instead of hard-coded: labels must lie in
    # [0, num_class), and an unused extra class only adds trees to train.
    'num_class': int(np.max(y_train)) + 1,
    'eval_metric': 'merror',
    'eta': 0.4,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# multi:softmax returns the predicted class index as a float per row.
preds = model.predict(dtest)
pred_labels = [int(pred) for pred in preds]

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_test, pred_labels))
def predict(kata):
    """Predict the sentiment class of a single raw review text.

    The text goes through the same steps the training reviews went through
    before vectorization: character cleaning, stop-word removal,
    lemmatization and removal of the domain-specific words. It is then
    encoded with the already fitted ``cv`` vectorizer and scored by ``model``.

    Args:
        kata: Raw review text.

    Returns:
        The first element of ``model.predict`` for the encoded text.
    """
    cleaned = preprocess_data(kata)
    cleaned = " ".join(word for word in cleaned.split() if word not in stop)
    cleaned = space(cleaned)
    cleaned = " ".join(word for word in cleaned.split() if word not in words_to_remove)

    # Reuse the vectorizer fitted during training; refitting it on every call
    # is wasted work and risks a vocabulary that no longer matches the model.
    X_pred = cv.transform([cleaned])
    dmatrix = xgb.DMatrix(X_pred)
    preds = model.predict(dmatrix)
    return preds[0]