Spaces:

rxxnzz
/

chatgptreviews

Sleeping

App Files Files Community

chatgptreviews / tubes.py

rxxnzz

Rename main.py to tubes.py

1787442 verified 4 months ago

raw

history blame

No virus

5.73 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	import spacy
	import json,os,uuid
	import re
	import nltk
	from nltk.corpus import stopwords
	from wordcloud import WordCloud, STOPWORDS
	from sklearn.feature_extraction.text import CountVectorizer
	from nltk.tokenize import RegexpTokenizer

	from imblearn.over_sampling import SMOTE
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score,classification_report
	import xgboost as xgb
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.naive_bayes import MultinomialNB

	from wordcloud import WordCloud, STOPWORDS
	import matplotlib.pyplot as plt
	from PIL import Image

	import warnings
	# Silence every warning category for the rest of the run.
	warnings.filterwarnings('ignore')
	# Fetch the NLTK stop-word corpus that stopwords.words('english') reads below.
	nltk.download('stopwords')
	# Load the small English spaCy pipeline with the parser and NER components
	# disabled; further down only token lemmas (token.lemma_) are read from it.
	nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

	# Load the review dataset (relative path, resolved against the current
	# working directory).
	data = pd.read_csv('chatgpt_reviews.csv')

	# First look at the raw data. Outside a notebook a bare expression is
	# evaluated and thrown away, so the results are printed explicitly.
	# DataFrame.info() writes to stdout by itself and needs no print().
	print(data.head())
	data.info()
	print(data.describe())
	print(data.describe(include='object'))

	"""<h3> Analysis of Rating column </h3>"""

	# Number of reviews per rating, then the same as a percentage of all
	# reviews. Printed explicitly: a bare expression shows nothing in a script.
	print(data['rating'].value_counts().sort_index())
	print(data['rating'].value_counts(normalize=True).mul(100).round(2).sort_index())

	# Bar chart of the rating distribution.
	palette = "deep"
	sns.set_palette(palette)

	# countplot returns the Axes it drew on; label that Axes directly.
	ax = sns.countplot(data=data, x='rating')
	ax.set(xlabel='Rating', ylabel='No. of Users', title='Ratings Distribution')

	plt.show()

	"""Preprocessing"""

	# Number of missing values per column, most affected column first.
	# Printed explicitly: a bare expression shows nothing in a script.
	print(data.isnull().sum().sort_values(ascending=False))

	# Combine the review title and the review body into one text column.
	# Missing values are replaced by empty strings first, because
	# "string + NaN" yields NaN, which is not a string and cannot be cleaned
	# as text later on.
	data['complete_review'] = data['title'].fillna('') + ' .' + data['review'].fillna('')
	data = data.drop(columns=['date', 'review', 'title'])

	print(data.head())

	# Both patterns are compiled once at import time instead of on every call.
	_EMOJI_PATTERN = re.compile(
	    "["
	    u"\U0001F600-\U0001F64F"  # emoticons
	    u"\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
	    u"\U0001F680-\U0001F6FF"  # transport and map symbols
	    u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
	    u"\U00002702-\U000027B0"  # dingbats
	    # NOTE: this last range is very wide; it spans far more than emoji
	    # (for example the CJK ideograph blocks lie inside it).
	    u"\U000024C2-\U0001F251"
	    "]+",
	    flags=re.UNICODE,
	)
	# Every run of characters that is not a letter a-z, a digit or whitespace.
	_SPECIAL_CHAR_PATTERN = re.compile(r'[^a-z\d\s]+', re.IGNORECASE)


	def preprocess_data(text):
	    """Normalise one review text for vectorisation.

	    The text is lower-cased, characters in the emoji ranges are removed, and
	    then every remaining character that is not a letter a-z, a digit or
	    whitespace is removed. Whitespace is left untouched, so removing a
	    symbol between two spaces leaves both spaces in place.

	    Args:
	        text: The raw text. ``None`` and float NaN (what pandas uses for a
	            missing cell) are treated as an empty review; any other
	            non-string value is converted with ``str()`` first.

	    Returns:
	        The cleaned, lower-case string (``""`` for missing input).
	    """
	    # NaN is the only float that is not equal to itself.
	    if text is None or (isinstance(text, float) and text != text):
	        return ""
	    text = str(text).lower()
	    text = _EMOJI_PATTERN.sub('', text)
	    return _SPECIAL_CHAR_PATTERN.sub('', text)
	# Clean every combined review with preprocess_data.
	data['complete_review'] = data['complete_review'].apply(preprocess_data)
	# Printed explicitly: a bare expression shows nothing in a script.
	print(data['complete_review'].head())

	# Quick check of the cleaner on a sample sentence.
	print(preprocess_data("Hallo, My name"))

	# Remove stopwords.
	# `stop` stays the list NLTK returns; membership is tested against a set
	# copy so each token costs one hash lookup instead of a scan of the list.
	stop = stopwords.words('english')
	stop_lookup = set(stop)
	data['complete_review'] = data['complete_review'].apply(
	    lambda review: " ".join(word for word in review.split() if word not in stop_lookup)
	)

	"""Lemmatization"""

	def space(comment):
	    """Return *comment* with each token replaced by its lemma.

	    The text is run through the module-level spaCy pipeline ``nlp`` and the
	    ``lemma_`` of every token is joined back together with single spaces.
	    """
	    parsed = nlp(comment)
	    lemmas = (tok.lemma_ for tok in parsed)
	    return " ".join(lemmas)
	# Lemmatize all reviews in one batched pass. nlp.pipe() streams the texts
	# through the pipeline in batches, which is usually considerably faster
	# than one nlp() call per row; each Doc is joined the same way space() does.
	data['complete_review'] = [
	    " ".join(tok.lemma_ for tok in doc)
	    for doc in nlp.pipe(data['complete_review'])
	]

	"""menghapus spesifik kata"""

	# Remove specific words: product and platform terms dropped from every review.
	words_to_remove = ['chatgpt','app','chatgpts','chat','gpt','iphone','ipad','gpt4','phone','number','ai','use','io']
	data['complete_review'] = data['complete_review'].apply(
	    lambda review: " ".join(word for word in review.split() if word not in words_to_remove)
	)

	# Binary sentiment label: rating above 3 -> 1, anything else -> 0.
	# Computed as one vectorised comparison instead of a per-row lambda.
	data['sentiment'] = (data['rating'] > 3).astype(int)

	# Printed explicitly: a bare expression shows nothing in a script.
	print(data.head(5))

	# Class balance in percent.
	print(data['sentiment'].value_counts(normalize=True).mul(100).round(2))

	# The data is imbalanced. The note that accompanied this step quoted about
	# 66% positive, 24% negative and 9.5% neutral reviews, but the `sentiment`
	# column built above is binary: every rating of 3 or lower is class 0, so
	# there is no separate neutral class. The printed balance above is the
	# authoritative figure.
	#
	# Reviews Analysis

	# Word clouds of the review text: all reviews first, then the positive
	# (sentiment == 1) and the negative (sentiment == 0) subset.
	stopword = set(stopwords.words('english'))
	positive = data[data['sentiment'] == 1]
	negative = data[data['sentiment'] == 0]

	for reviews in (data.complete_review, positive.complete_review, negative.complete_review):
	    # One long string per subset is what WordCloud.generate() is fed.
	    text = " ".join(reviews)
	    wordcloud = WordCloud(stopwords=stopword).generate(text)
	    plt.imshow(wordcloud, interpolation='bilinear')
	    plt.axis("off")
	    plt.show()

	"""Model

	Bag of Word Vectorization
	"""

	# Bag-of-words features with CountVectorizer: unigram counts over
	# alphanumeric tokens, with scikit-learn's English stop-word list applied.
	token = RegexpTokenizer(r'[a-zA-Z0-9]+')
	cv = CountVectorizer(
	    tokenizer=token.tokenize,
	    ngram_range=(1, 1),
	    stop_words='english',
	)

	# Target: the binary sentiment label. Features: the document-term matrix.
	y = data['sentiment']
	X = cv.fit_transform(data['complete_review'])

	"""Handle Imbalanced Data"""

	# Train/test split first, oversampling second.
	# SMOTE creates synthetic rows by interpolating between existing ones. If it
	# runs before the split, rows synthesised from test samples land in the
	# training set and synthetic rows land in the test set, so the reported
	# scores are optimistic. The held-out 15% therefore stays untouched.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.15,
	    random_state=17,
	    stratify=y,
	)

	# Balance the classes on the training portion only. The seed makes the
	# resampling repeatable, like the seeded split above.
	smote = SMOTE(random_state=17)
	X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
	X_train, y_train = X_oversampled, y_oversampled

	"""XGBoost"""

	# Wrap the train and test matrices in XGBoost's DMatrix container.
	dtrain = xgb.DMatrix(X_train, label=y_train)
	dtest = xgb.DMatrix(X_test, label=y_test)

	params = {
	    # multi:softmax makes predict() return class ids rather than probabilities.
	    'objective': 'multi:softmax',
	    # The sentiment label is binary (0 or 1), so there are two classes.
	    'num_class': 2,
	    'eval_metric': 'merror',
	    'eta': 0.4,
	    'max_depth': 6,
	    'subsample': 0.8,
	    'colsample_bytree': 0.8,
	    'seed': 42,
	}

	num_rounds = 100
	model = xgb.train(params, dtrain, num_rounds)

	# predict() returns the class ids as floats; convert them to plain ints.
	preds = model.predict(dtest)
	pred_labels = preds.astype(int).tolist()

	# classification_report takes (y_true, y_pred). With the arguments swapped,
	# precision and recall are reported under each other's name.
	print(classification_report(y_test, pred_labels))

	def predict(kata):
	    """Predict the sentiment class of a single raw review text.

	    The text goes through the same steps the training reviews went through
	    (cleaning, stop-word removal, lemmatisation, removal of the words in
	    ``words_to_remove``) so that its tokens match the vocabulary the
	    vectorizer learned. It is then vectorised with the already fitted
	    ``cv`` and scored by ``model``.

	    Args:
	        kata: The raw review text.

	    Returns:
	        The first element of ``model.predict``'s output, i.e. the predicted
	        class id (the model was trained on 1 for ratings above 3, else 0).
	    """
	    cleaned = preprocess_data(kata)
	    cleaned = " ".join(word for word in cleaned.split() if word not in stop)
	    cleaned = space(cleaned)
	    cleaned = " ".join(word for word in cleaned.split() if word not in words_to_remove)

	    # `cv` is already fitted on the training corpus. Refitting it on every
	    # call would rebuild the same vocabulary at the cost of a full pass over
	    # the data, so only transform() is used here.
	    X_pred = cv.transform([cleaned])
	    dmatrix = xgb.DMatrix(X_pred)
	    preds = model.predict(dmatrix)
	    return preds[0]