rxxnzz committed on
Commit
07bb506
1 Parent(s): 3615d3f

Upload 2 files

Browse files
Files changed (2) hide show
  1. chatgpt_reviews.csv +0 -0
  2. tubes_tm.py +200 -0
chatgpt_reviews.csv ADDED
The diff for this file is too large to render. See raw diff
 
tubes_tm.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+
6
+ import spacy
7
+ import json,os,uuid
8
+ import re
9
+ import nltk
10
+ from nltk.corpus import stopwords
11
+ from wordcloud import WordCloud, STOPWORDS
12
+ from sklearn.feature_extraction.text import CountVectorizer
13
+ from nltk.tokenize import RegexpTokenizer
14
+
15
+ from imblearn.over_sampling import SMOTE
16
+ from sklearn.model_selection import train_test_split
17
+ from sklearn.metrics import accuracy_score,classification_report
18
+ import xgboost as xgb
19
+ from sklearn.ensemble import RandomForestClassifier
20
+ from sklearn.linear_model import LogisticRegression
21
+ from sklearn.naive_bayes import MultinomialNB
22
+
23
+ from wordcloud import WordCloud, STOPWORDS
24
+ import matplotlib.pyplot as plt
25
+ from PIL import Image
26
+
27
# Runtime setup: silence library warnings, fetch the NLTK stopword list, and
# load the small English spaCy pipeline (parser/NER disabled — only the
# lemmatizer path is needed, and this keeps loading/processing fast).
import warnings

warnings.filterwarnings('ignore')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
31
+
32
data = pd.read_csv('chatgpt_reviews.csv')

# Notebook-style exploratory pass: the bare expressions below only display
# output in an interactive session; they have no effect when run as a script.
data.head()

data.info()

data.describe()

data.describe(include='object')

"""<h3> Analysis of Rating column </h3>"""

# Star-rating breakdown: absolute counts, then the percentage share per level.
ratings = data['rating']
ratings.value_counts().sort_index()

ratings.value_counts(normalize=True).mul(100).round(2).sort_index()
47
+
48
# Bar chart of how many users gave each star rating.
sns.set_palette("deep")

ax = sns.countplot(data=data, x='rating')
ax.set_xlabel('Rating')
ax.set_ylabel('No. of Users')
ax.set_title('Ratings Distribution')

plt.show()
59
+
60
+ """Preprocessing"""
61
+
62
+ #Find no. of missing values in each column
63
+ data.isnull().sum().sort_values(ascending=False)
64
+
65
+ #Combine Review Time and Review
66
+ data['complete_review'] = data['title'] +' .'+data['review']
67
+ data = data.drop(['date','review','title'],axis='columns')
68
+
69
+ data.head()
70
+
71
# Compiled once at module scope so repeated calls (one per review, plus every
# predict() call) don't pay the re.compile cost each time.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2-\U0001F251"  # enclosed characters
                            "]+", flags=re.UNICODE)
_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)

def preprocess_data(text):
    """Lowercase *text*, strip emoji, and drop non-alphanumeric characters.

    Digits and whitespace are preserved; everything outside ``[a-z0-9\\s]``
    (after emoji removal) is deleted.  Returns the cleaned string.
    """
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    return _SPECIAL_CHAR_PATTERN.sub('', text)
85
# Clean every combined review in place; the lambda wrapper is unnecessary.
data['complete_review'] = data['complete_review'].apply(preprocess_data)
data['complete_review'].head()

# Sanity check of the cleaner on a toy string.
preprocess_data("Hallo, My name")
89
+
90
+ """hapus stopwords"""
91
+
92
+ stop = stopwords.words('english')
93
+ data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
94
+
95
+ """Lemmatization"""
96
+
97
+ def space(comment):
98
+ doc = nlp(comment)
99
+ return " ".join([token.lemma_ for token in doc])
100
+ data['complete_review']= data['complete_review'].apply(space)
101
+
102
+ """menghapus spesifik kata"""
103
+
104
+ words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io']
105
+ data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in words_to_remove))
106
+
107
+ data['sentiment'] = data['rating'].apply(lambda rating: 1 if rating > 3 else 0)
108
+
109
+ data.head(5)
110
+
111
+ data['sentiment'].value_counts(normalize=True).mul(100).round(2)
112
+
113
+ """Data is Imbalanced as about 66% of sentiment is positive, 24% is negative and 9.5% is neutral.
114
+
115
+ # Reviews Analysis
116
+ """
117
+
118
+ #Analysis of Review field
119
+ stopword = set(stopwords.words('english'))
120
+ text = " ".join(review for review in data.complete_review)
121
+ wordcloud = WordCloud(stopwords=stopword).generate(text)
122
+ plt.imshow(wordcloud, interpolation='bilinear')
123
+ plt.axis("off")
124
+ plt.show()
125
+
126
+ #positive negative & neutral sentiment:
127
+ positive = data[data['sentiment'] == 1]
128
+ negative = data[data['sentiment'] == 0]
129
+
130
+ #Positive Setiment
131
+ stopword = set(stopwords.words('english'))
132
+ text = " ".join(review for review in positive.complete_review)
133
+ wordcloud = WordCloud(stopwords=stopword).generate(text)
134
+ plt.imshow(wordcloud, interpolation='bilinear')
135
+ plt.axis("off")
136
+ plt.show()
137
+
138
+ #Negative Setiment
139
+ stopword = set(stopwords.words('english'))
140
+ text = " ".join(review for review in negative.complete_review)
141
+ wordcloud = WordCloud(stopwords=stopword).generate(text)
142
+ plt.imshow(wordcloud, interpolation='bilinear')
143
+ plt.axis("off")
144
+ plt.show()
145
+
146
+ """Model
147
+
148
+ Bag of Word Vectorization
149
+ """
150
+
151
+ #Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
152
+ token = RegexpTokenizer(r'[a-zA-Z0-9]+')
153
+ cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
154
+ X = cv.fit_transform(data['complete_review'])
155
+ y = data['sentiment']
156
+
157
+ """Handle Imbalanced Data"""
158
+
159
+ smote = SMOTE()
160
+ X_oversampled, y_oversampled = smote.fit_resample(X, y)
161
+
162
+ """Train Test Split"""
163
+
164
+ X_train, X_test, y_train, y_test = train_test_split(X_oversampled,
165
+ y_oversampled,
166
+ test_size=0.15,
167
+ random_state=17,stratify=y_oversampled)
168
+
169
+ """XGBoost"""
170
+
171
+ dtrain = xgb.DMatrix(X_train, label=y_train)
172
+ dtest = xgb.DMatrix(X_test, label=y_test)
173
+
174
+
175
+ params = {
176
+ 'objective': 'multi:softmax',
177
+ 'num_class': 3,
178
+ 'eval_metric': 'merror',
179
+ 'eta': 0.4,
180
+ 'max_depth': 6,
181
+ 'subsample': 0.8,
182
+ 'colsample_bytree': 0.8,
183
+ 'seed': 42
184
+ }
185
+
186
+ num_rounds = 100
187
+ model = xgb.train(params, dtrain, num_rounds)
188
+
189
+ preds = model.predict(dtest)
190
+ pred_labels = [int(pred) for pred in preds]
191
+
192
+ print(classification_report(pred_labels, y_test))
193
+
194
def predict(kata):
    """Return the model's sentiment label (0.0/1.0) for a raw review string.

    The input is cleaned with ``preprocess_data`` and vectorized with the
    CountVectorizer already fitted on the training corpus.  The original
    implementation called ``cv.fit(...)`` here, refitting the vocabulary on
    every prediction — wasteful, and it would silently misalign features with
    the trained model if the corpus ever changed.

    NOTE(review): training text also had stopwords removed, was lemmatized,
    and had domain words stripped; this path skips those steps — confirm
    whether inference should mirror the full training pipeline.
    """
    preprocessed_kata = preprocess_data(kata)
    # transform, not fit: `cv` was fitted once via cv.fit_transform above.
    X_pred = cv.transform(pd.Series([preprocessed_kata]))
    dmatrix = xgb.DMatrix(X_pred)
    preds = model.predict(dmatrix)
    return preds[0]