Spaces:
Running
Running
Upload 2 files
Browse files- chatgpt_reviews.csv +0 -0
- tubes_tm.py +200 -0
chatgpt_reviews.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tubes_tm.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
+
|
6 |
+
import spacy
|
7 |
+
import json,os,uuid
|
8 |
+
import re
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from wordcloud import WordCloud, STOPWORDS
|
12 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
13 |
+
from nltk.tokenize import RegexpTokenizer
|
14 |
+
|
15 |
+
from imblearn.over_sampling import SMOTE
|
16 |
+
from sklearn.model_selection import train_test_split
|
17 |
+
from sklearn.metrics import accuracy_score,classification_report
|
18 |
+
import xgboost as xgb
|
19 |
+
from sklearn.ensemble import RandomForestClassifier
|
20 |
+
from sklearn.linear_model import LogisticRegression
|
21 |
+
from sklearn.naive_bayes import MultinomialNB
|
22 |
+
|
23 |
+
from wordcloud import WordCloud, STOPWORDS
|
24 |
+
import matplotlib.pyplot as plt
|
25 |
+
from PIL import Image
|
26 |
+
|
27 |
+
import warnings

# Silence library deprecation/user warnings so notebook output stays readable.
warnings.filterwarnings('ignore')

# Fetch the NLTK English stopword list (no-op if already present locally).
nltk.download('stopwords')

# Load the small English spaCy pipeline; parser and NER are disabled because
# only tagging/lemmatization is used downstream (see the `space` helper).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
|
31 |
+
|
32 |
+
# Load the ChatGPT app-store reviews dataset.
data = pd.read_csv('chatgpt_reviews.csv')

# Notebook-style quick inspection (bare expressions; no effect when run as a script).
data.head()

data.info()

data.describe()

data.describe(include='object')

"""<h3> Analysis of Rating column </h3>"""

# Absolute counts per star rating.
data['rating'].value_counts().sort_index()

# Percentage share per star rating, rounded to 2 decimals.
data['rating'].value_counts(normalize=True).mul(100).round(2).sort_index()

#Plot
palette = "deep"
sns.set_palette(palette)

# Bar chart of how many users gave each rating.
sns.countplot(data=data, x='rating')

plt.xlabel('Rating')
plt.ylabel('No. of Users')
plt.title('Ratings Distribution')

plt.show()
|
59 |
+
|
60 |
+
"""Preprocessing"""
|
61 |
+
|
62 |
+
#Find no. of missing values in each column
data.isnull().sum().sort_values(ascending=False)

#Combine Review Time and Review
# Guard against missing title/review text: NaN + str propagates NaN, which
# would later crash preprocess_data (it expects a str), so fill blanks first.
data['complete_review'] = data['title'].fillna('') + ' .' + data['review'].fillna('')
# Keep only the combined text (plus rating); drop the now-redundant columns.
data = data.drop(['date','review','title'],axis='columns')

data.head()
|
70 |
+
|
71 |
+
# Compiled once at import time so repeated .apply() calls do not pay the
# (cached but non-free) re.compile lookup on every single review.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
# Anything that is not a letter, digit or whitespace is stripped.
_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)


def preprocess_data(text):
    """Lower-case *text* and strip emoji and special characters.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text containing only lower-case letters, digits and
        whitespace.  Removed punctuation is NOT replaced by a space, so
        tokens separated only by punctuation get fused (e.g. "a.b" -> "ab").
    """
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    return _SPECIAL_CHAR_PATTERN.sub('', text)
|
85 |
+
# Clean every combined review; passing the function directly avoids the
# redundant lambda wrapper.
data['complete_review'] = data['complete_review'].apply(preprocess_data)
data['complete_review'].head()

# Sanity check of the cleaner on a small example.
preprocess_data("Hallo, My name")
|
89 |
+
|
90 |
+
"""hapus stopwords"""
|
91 |
+
|
92 |
+
stop = stopwords.words('english')
|
93 |
+
data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
|
94 |
+
|
95 |
+
"""Lemmatization"""
|
96 |
+
|
97 |
+
def space(comment):
|
98 |
+
doc = nlp(comment)
|
99 |
+
return " ".join([token.lemma_ for token in doc])
|
100 |
+
data['complete_review']= data['complete_review'].apply(space)
|
101 |
+
|
102 |
+
"""menghapus spesifik kata"""
|
103 |
+
|
104 |
+
words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io']
|
105 |
+
data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in words_to_remove))
|
106 |
+
|
107 |
+
# Binarise the star rating: 4-5 stars -> 1 (positive), otherwise 0 (negative).
# Vectorised comparison + cast produces the same 0/1 labels as the original
# per-row apply(lambda rating: 1 if rating > 3 else 0).
data['sentiment'] = (data['rating'] > 3).astype(int)

data.head(5)

# Class balance in percent.
data['sentiment'].value_counts(normalize=True).mul(100).round(2)
|
112 |
+
|
113 |
+
"""Data is Imbalanced as about 66% of sentiment is positive, 24% is negative and 9.5% is neutral.
|
114 |
+
|
115 |
+
# Reviews Analysis
|
116 |
+
"""
|
117 |
+
|
118 |
+
#Analysis of Review field
stopword = set(stopwords.words('english'))


def _show_wordcloud(reviews):
    """Join *reviews* into one document and render its word cloud."""
    text = " ".join(review for review in reviews)
    wordcloud = WordCloud(stopwords=stopword).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# Word cloud over every review (originally this plotting code was
# copy-pasted three times; it is factored into the helper above).
_show_wordcloud(data.complete_review)

#positive & negative sentiment subsets:
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]

#Positive Sentiment
_show_wordcloud(positive.complete_review)

#Negative Sentiment
_show_wordcloud(negative.complete_review)
|
145 |
+
|
146 |
+
"""Model
|
147 |
+
|
148 |
+
Bag of Word Vectorization
|
149 |
+
"""
|
150 |
+
|
151 |
+
#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
|
152 |
+
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
|
153 |
+
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
|
154 |
+
X = cv.fit_transform(data['complete_review'])
|
155 |
+
y = data['sentiment']
|
156 |
+
|
157 |
+
"""Handle Imbalanced Data"""
|
158 |
+
|
159 |
+
smote = SMOTE()
|
160 |
+
X_oversampled, y_oversampled = smote.fit_resample(X, y)
|
161 |
+
|
162 |
+
"""Train Test Split"""
|
163 |
+
|
164 |
+
X_train, X_test, y_train, y_test = train_test_split(X_oversampled,
|
165 |
+
y_oversampled,
|
166 |
+
test_size=0.15,
|
167 |
+
random_state=17,stratify=y_oversampled)
|
168 |
+
|
169 |
+
"""XGBoost"""
|
170 |
+
|
171 |
+
dtrain = xgb.DMatrix(X_train, label=y_train)
|
172 |
+
dtest = xgb.DMatrix(X_test, label=y_test)
|
173 |
+
|
174 |
+
|
175 |
+
params = {
|
176 |
+
'objective': 'multi:softmax',
|
177 |
+
'num_class': 3,
|
178 |
+
'eval_metric': 'merror',
|
179 |
+
'eta': 0.4,
|
180 |
+
'max_depth': 6,
|
181 |
+
'subsample': 0.8,
|
182 |
+
'colsample_bytree': 0.8,
|
183 |
+
'seed': 42
|
184 |
+
}
|
185 |
+
|
186 |
+
num_rounds = 100
|
187 |
+
model = xgb.train(params, dtrain, num_rounds)
|
188 |
+
|
189 |
+
preds = model.predict(dtest)
|
190 |
+
pred_labels = [int(pred) for pred in preds]
|
191 |
+
|
192 |
+
print(classification_report(pred_labels, y_test))
|
193 |
+
|
194 |
+
def predict(kata):
    """Predict the sentiment class for a single raw review string.

    Parameters
    ----------
    kata : str
        Raw review text ("kata" is Indonesian for "word").

    Returns
    -------
    float
        Class label predicted by the trained XGBoost model
        (0 = negative, 1 = positive).
    """
    preprocessed_kata = preprocess_data(kata)
    # Reuse the CountVectorizer already fitted on the training corpus.  The
    # original re-ran cv.fit(data['complete_review']) on every call, which
    # rebuilds the whole vocabulary for no benefit.
    X_pred = cv.transform(pd.Series([preprocessed_kata]))
    # NOTE(review): training text was also stopword-filtered and lemmatised;
    # applying only preprocess_data here skips those steps -- confirm whether
    # inference should mirror the full training pipeline.
    dmatrix = xgb.DMatrix(X_pred)
    preds = model.predict(dmatrix)
    return preds[0]
|