Spaces:
Running
Running
Upload 2 files
Browse files- chatgpt_reviews.csv +0 -0
- tubes_tm.py +200 -0
chatgpt_reviews.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tubes_tm.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
+
|
6 |
+
import spacy
|
7 |
+
import json,os,uuid
|
8 |
+
import re
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from wordcloud import WordCloud, STOPWORDS
|
12 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
13 |
+
from nltk.tokenize import RegexpTokenizer
|
14 |
+
|
15 |
+
from imblearn.over_sampling import SMOTE
|
16 |
+
from sklearn.model_selection import train_test_split
|
17 |
+
from sklearn.metrics import accuracy_score,classification_report
|
18 |
+
import xgboost as xgb
|
19 |
+
from sklearn.ensemble import RandomForestClassifier
|
20 |
+
from sklearn.linear_model import LogisticRegression
|
21 |
+
from sklearn.naive_bayes import MultinomialNB
|
22 |
+
|
23 |
+
from wordcloud import WordCloud, STOPWORDS
|
24 |
+
import matplotlib.pyplot as plt
|
25 |
+
from PIL import Image
|
26 |
+
|
27 |
+
import warnings

# Silence library deprecation/user warnings so notebook output stays readable.
warnings.filterwarnings('ignore')

# Fetch the NLTK English stopword list (no-op if already present locally).
nltk.download('stopwords')

# Load the small English spaCy pipeline; parser and NER are disabled because
# only tagging/lemmatization is used downstream (see the `space` helper).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
|
31 |
+
|
32 |
+
# Load the ChatGPT app-store reviews dataset.
data = pd.read_csv('chatgpt_reviews.csv')

# Notebook-style quick inspection (bare expressions; no effect when run as a script).
data.head()

data.info()

data.describe()

data.describe(include='object')

"""<h3> Analysis of Rating column </h3>"""

# Absolute counts per star rating.
data['rating'].value_counts().sort_index()

# Percentage share per star rating, rounded to 2 decimals.
data['rating'].value_counts(normalize=True).mul(100).round(2).sort_index()

#Plot
palette = "deep"
sns.set_palette(palette)

# Bar chart of how many users gave each rating.
sns.countplot(data=data, x='rating')

plt.xlabel('Rating')
plt.ylabel('No. of Users')
plt.title('Ratings Distribution')

plt.show()
|
59 |
+
|
60 |
+
"""Preprocessing"""
|
61 |
+
|
62 |
+
#Find no. of missing values in each column
data.isnull().sum().sort_values(ascending=False)

#Combine Review Time and Review
# Guard against missing title/review text: NaN + str propagates NaN, which
# would later crash preprocess_data (it expects a str), so fill blanks first.
data['complete_review'] = data['title'].fillna('') + ' .' + data['review'].fillna('')
# Keep only the combined text (plus rating); drop the now-redundant columns.
data = data.drop(['date','review','title'],axis='columns')

data.head()
|
70 |
+
|
71 |
+
# Compiled once at import time so repeated .apply() calls do not pay the
# (cached but non-free) re.compile lookup on every single review.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
# Anything that is not a letter, digit or whitespace is stripped.
_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)


def preprocess_data(text):
    """Lower-case *text* and strip emoji and special characters.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text containing only lower-case letters, digits and
        whitespace.  Removed punctuation is NOT replaced by a space, so
        tokens separated only by punctuation get fused (e.g. "a.b" -> "ab").
    """
    text = text.lower()
    text = _EMOJI_PATTERN.sub('', text)
    return _SPECIAL_CHAR_PATTERN.sub('', text)
|
85 |
+
# Clean every combined review; passing the function directly avoids the
# redundant lambda wrapper.
data['complete_review'] = data['complete_review'].apply(preprocess_data)
data['complete_review'].head()

# Sanity check of the cleaner on a small example.
preprocess_data("Hallo, My name")
|
89 |
+
|
90 |
+
"""hapus stopwords"""
|
91 |
+
|
92 |
+
stop = stopwords.words('english')
|
93 |
+
data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
|
94 |
+
|
95 |
+
"""Lemmatization"""
|
96 |
+
|
97 |
+
def space(comment):
|
98 |
+
doc = nlp(comment)
|
99 |
+
return " ".join([token.lemma_ for token in doc])
|
100 |
+
data['complete_review']= data['complete_review'].apply(space)
|
101 |
+
|
102 |
+
"""menghapus spesifik kata"""
|
103 |
+
|
104 |
+
words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io']
|
105 |
+
data['complete_review'] = data['complete_review'].apply(lambda x: " ".join(x for x in x.split() if x not in words_to_remove))
|
106 |
+
|
107 |
+
# Binarise the star rating: 4-5 stars -> 1 (positive), otherwise 0 (negative).
# Vectorised comparison + cast produces the same 0/1 labels as the original
# per-row apply(lambda rating: 1 if rating > 3 else 0).
data['sentiment'] = (data['rating'] > 3).astype(int)

data.head(5)

# Class balance in percent.
data['sentiment'].value_counts(normalize=True).mul(100).round(2)
|
112 |
+
|
113 |
+
"""Data is Imbalanced as about 66% of sentiment is positive, 24% is negative and 9.5% is neutral.
|
114 |
+
|
115 |
+
# Reviews Analysis
|
116 |
+
"""
|
117 |
+
|
118 |
+
#Analysis of Review field
stopword = set(stopwords.words('english'))


def _show_wordcloud(reviews):
    """Join *reviews* into one document and render its word cloud."""
    text = " ".join(review for review in reviews)
    wordcloud = WordCloud(stopwords=stopword).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# Word cloud over every review (originally this plotting code was
# copy-pasted three times; it is factored into the helper above).
_show_wordcloud(data.complete_review)

#positive & negative sentiment subsets:
positive = data[data['sentiment'] == 1]
negative = data[data['sentiment'] == 0]

#Positive Sentiment
_show_wordcloud(positive.complete_review)

#Negative Sentiment
_show_wordcloud(negative.complete_review)
|
145 |
+
|
146 |
+
"""Model
|
147 |
+
|
148 |
+
Bag of Word Vectorization
|
149 |
+
"""
|
150 |
+
|
151 |
+
#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
|
152 |
+
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
|
153 |
+
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
|
154 |
+
X = cv.fit_transform(data['complete_review'])
|
155 |
+
y = data['sentiment']
|
156 |
+
|
157 |
+
"""Handle Imbalanced Data"""
|
158 |
+
|
159 |
+
smote = SMOTE()
|
160 |
+
X_oversampled, y_oversampled = smote.fit_resample(X, y)
|
161 |
+
|
162 |
+
"""Train Test Split"""
|
163 |
+
|
164 |
+
X_train, X_test, y_train, y_test = train_test_split(X_oversampled,
|
165 |
+
y_oversampled,
|
166 |
+
test_size=0.15,
|
167 |
+
random_state=17,stratify=y_oversampled)
|
168 |
+
|
169 |
+
"""XGBoost"""
|
170 |
+
|
171 |
+
dtrain = xgb.DMatrix(X_train, label=y_train)
|
172 |
+
dtest = xgb.DMatrix(X_test, label=y_test)
|
173 |
+
|
174 |
+
|
175 |
+
params = {
|
176 |
+
'objective': 'multi:softmax',
|
177 |
+
'num_class': 3,
|
178 |
+
'eval_metric': 'merror',
|
179 |
+
'eta': 0.4,
|
180 |
+
'max_depth': 6,
|
181 |
+
'subsample': 0.8,
|
182 |
+
'colsample_bytree': 0.8,
|
183 |
+
'seed': 42
|
184 |
+
}
|
185 |
+
|
186 |
+
num_rounds = 100
|
187 |
+
model = xgb.train(params, dtrain, num_rounds)
|
188 |
+
|
189 |
+
preds = model.predict(dtest)
|
190 |
+
pred_labels = [int(pred) for pred in preds]
|
191 |
+
|
192 |
+
print(classification_report(pred_labels, y_test))
|
193 |
+
|
194 |
+
def predict(kata):
    """Predict the sentiment class for a single raw review string.

    Parameters
    ----------
    kata : str
        Raw review text ("kata" is Indonesian for "word").

    Returns
    -------
    float
        Class label predicted by the trained XGBoost model
        (0 = negative, 1 = positive).
    """
    preprocessed_kata = preprocess_data(kata)
    # Reuse the CountVectorizer already fitted on the training corpus.  The
    # original re-ran cv.fit(data['complete_review']) on every call, which
    # rebuilds the whole vocabulary for no benefit.
    X_pred = cv.transform(pd.Series([preprocessed_kata]))
    # NOTE(review): training text was also stopword-filtered and lemmatised;
    # applying only preprocess_data here skips those steps -- confirm whether
    # inference should mirror the full training pipeline.
    dmatrix = xgb.DMatrix(X_pred)
    preds = model.predict(dmatrix)
    return preds[0]
|