|
|
|
"""SafaricomProject.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1Q0IBBWS6EJsk7j1mGoRghqpR-dePQ3yi |
|
|
|
""" |
|
import numpy as np

import pandas as pd
|
|
|
|
|
|
|
|
|
import emoji |
|
import nltk |
|
from nltk.tokenize import word_tokenize |
|
from nltk.corpus import stopwords |
|
import string |
|
import matplotlib.pyplot as plt |
|
import re |
|
|
|
from collections import Counter |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.svm import LinearSVC |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.naive_bayes import MultinomialNB |
|
|
|
|
|
|
|
df = pd.read_csv('safaricomDataset.csv') |
|
df.head() |
|
|
|
df.columns |
|
|
|
df.shape |
|
|
|
tweets_df = df[["Date", "User", "Tweet"]].copy()  # copy to avoid SettingWithCopyWarning when adding columns later
|
tweets_df.head() |
|
|
|
|
|
tweets_df.shape |
|
|
|
"""#Preprocessing and Cleaning of the Dataset """ |
|
|
|
nltk.download('punkt') |
|
|
def tokenize_tweets(text):

    text = emoji.demojize(text)                 # convert emoji to text aliases like :smile:

    text = re.sub(r'http[s]?://\S+', '', text)  # strip URLs

    text = re.sub(r'[^\w\s]', '', text)         # strip punctuation

    text = re.sub(r'[0-9]+', '', text)          # strip digits

    text = word_tokenize(text)                  # split into word tokens

    return text
|
|
|
|
|
tweets_df["Tweets"] = tweets_df["Tweet"].apply(lambda x: tokenize_tweets(x)) |
|
|
|
|
|
nltk.download('stopwords') |
|
stop = stopwords.words("english")

tweets_df["stop_words"] = tweets_df["Tweets"].apply(lambda x: [w for w in x if w.lower() in stop])

# lowercase before matching: the NLTK stopword list is all lowercase
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [w.lower() for w in x if w.lower() not in stop])
|
|
|
tweets_df.head(10) |
|
|
|
tweets_df.head() |
|
|
|
|
|
string.punctuation |
|
|
|
from nltk.stem.porter import PorterStemmer
|
|
|
stemmer = PorterStemmer() |
|
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [stemmer.stem(w) for w in x]) |
|
tweets_df.head() |
|
|
|
|
|
def remove_punct(tokens):

    # join the stemmed tokens back into one string, dropping punctuation tokens
    text = " ".join([tok for tok in tokens if tok not in string.punctuation])

    text = re.sub(r'[0-9]+', '', text)  # strip any remaining digits

    return text
|
|
|
|
|
tweets_df['tweet_punct'] = tweets_df['Tweets'].apply(remove_punct)
|
|
|
tweets_df.head() |
|
|
|
"""#Data Visualization(Word Cloud)""" |
|
|
"""#Get the most frequent words""" |
|
|
|
cnt = Counter()  # count raw word frequencies across all tweets
|
for text in df["Tweet"].values: |
|
for word in text.split(): |
|
cnt[word] += 1 |
|
|
|
cnt.most_common(20) |
|
|
|
"""#Using Vader Library to analyse sentiments in Text""" |
|
|
|
|
|
|
|
"""#Training of Dataset""" |
|
|
|
analyzer = SentimentIntensityAnalyzer() |
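# Quick illustrative check (the sentence is our own example): VADER returns a
# dict with 'neg', 'neu', 'pos' and 'compound' scores, which sentiment_score()
# below unpacks.
analyzer.polarity_scores("Safaricom customer care was very helpful today")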
|
|
|
"""#Getting the sentiments label""" |
|
|
|
|
|
def sentiment_score(sentence, key):

    # key is one of VADER's fields: 'compound', 'pos', 'neg' or 'neu'
    score = analyzer.polarity_scores(sentence)

    return score[key]


tweets_df["tweets_sent_compound"] = tweets_df["Tweet"].apply(lambda x: sentiment_score(x, 'compound'))

tweets_df["tweets_sent_pos"] = tweets_df["Tweet"].apply(lambda x: sentiment_score(x, 'pos'))

tweets_df["tweets_sent_neg"] = tweets_df["Tweet"].apply(lambda x: sentiment_score(x, 'neg'))
|
tweets_df.head() |
|
|
|
tweets_df.tail() |
|
|
|
|
|
|
|
|
|
"""#Vectorization""" |
|
|
|
tweets_list = tweets_df["tweet_punct"].tolist()  # corpus for the vectorizers below

len(tweets_list)
|
|
|
|
|
"""#Define Labels(Positive, Negative, Neutral)""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
def label_value(val):

    # map the VADER compound score to a class: 0 = negative, 1 = neutral, 2 = positive
    if val < 0:

        return 0

    elif val == 0:

        return 1

    else:

        return 2
|
|
|
|
|
tweets_df["label"] = tweets_df["tweets_sent_compound"].apply(lambda x: label_value(x)) |
|
tweets_df.head() |
|
|
|
cv = CountVectorizer(binary=True) |
|
cv.fit(tweets_list) |
|
X = cv.transform(tweets_list) |
|
y = tweets_df["label"].values |
|
|
|
"""#Plotting the Label Results""" |
|
|
|
|
|
|
|
plt.rcParams['figure.figsize'] = [10, 8]

# scatter each tweet's VADER positive score (x) against its negative score (y)
plt.scatter(tweets_df["tweets_sent_pos"], tweets_df["tweets_sent_neg"], color='blue')

plt.title('Safaricom Tweets Sentiment Analysis', fontsize=20)

plt.xlabel('Positive sentiment score', fontsize=15)

plt.ylabel('Negative sentiment score', fontsize=15)
|
plt.show() |
|
|
|
"""#Plotting on a Pie Chart and Bar Chart |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
tweets_df['label'].value_counts().plot(kind='pie', autopct='%1.0f%%') |
|
plt.show() |
|
|
|
tweets_df['label'].value_counts().sort_index().plot.bar() |
|
plt.show() |
|
|
|
"""#Classification using SVM""" |
|
|
|
|
|
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))  # binary bag of 1- to 3-grams
|
ngram_vectorizer.fit(tweets_list) |
|
X = ngram_vectorizer.transform(tweets_list) |
|
y = tweets_df["label"].values |
|
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)  # hold out 20% for validation
|
svm = LinearSVC() |
|
svm.fit(X_train, y_train) |
|
|
|
|
|
|
|
|
|
pred = svm.predict(X_val) |
|
print("Accuracy: ", accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
|
"""#TF-IDF Vectroization""" |
|
|
|
|
|
|
tfidf_vectorizer = TfidfVectorizer() |
|
tfidf_vectorizer.fit(tweets_list) |
|
X = tfidf_vectorizer.transform(tweets_list) |
|
y = tweets_df["label"].values |
|
|
|
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)  # hold out 20% for validation
|
|
|
svm = LinearSVC() |
|
svm.fit(X_train, y_train) |
|
pred = svm.predict(X_val) |
|
print("Accuracy: ", accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
|
|
|
"""#Classification using Logistic Regression""" |
|
|
|
lr = LogisticRegression()  # reuses the TF-IDF train/val split from the cell above
|
lr.fit(X_train, y_train) |
|
|
|
pred = lr.predict(X_val) |
|
print("Accuracy: ", accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
|
"""#Using TF-IDF Vectorization""" |
|
|
|
tfidf_vectorizer = TfidfVectorizer() |
|
tfidf_vectorizer.fit(tweets_list) |
|
X = tfidf_vectorizer.transform(tweets_list) |
|
y = tweets_df["label"].values |
|
|
|
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)  # hold out 20% for validation
|
|
|
lr = LogisticRegression() |
|
lr.fit(X_train, y_train) |
|
|
|
pred = lr.predict(X_val) |
|
print("Accuracy:", accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
|
"""#Classification using Naives Bayes""" |
|
|
|
MNB = MultinomialNB() |
|
MNB.fit(X_train, y_train) |
|
pred = MNB.predict(X_val) |
|
print(accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
|
"""# TF-IDF Vectorization""" |
|
|
|
tfidf_vectorizer = TfidfVectorizer() |
|
tfidf_vectorizer.fit(tweets_list) |
|
X = tfidf_vectorizer.transform(tweets_list) |
|
y = tweets_df["label"].values |
|
|
|
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)  # hold out 20% for validation
|
MNB = MultinomialNB() |
|
MNB.fit(X_train, y_train) |
|
pred = MNB.predict(X_val) |
|
print("Accuracy: ", accuracy_score(y_val, pred)) |
|
print(classification_report(y_val, pred)) |
|
print(confusion_matrix(y_val, pred)) |
|
|
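# The Flask app below reuses the `svm` model and `tfidf_vectorizer` fitted
# above; in a standalone deployment they would not be in scope. A minimal
# persistence sketch, assuming the joblib package and illustrative filenames:
import joblib

joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')  # save the fitted vectorizer
joblib.dump(svm, 'svm_model.joblib')                      # save the trained classifier

# in the deployed app, load them back before serving requests:
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
svm = joblib.load('svm_model.joblib')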
|
|
|
from flask import Flask, request, jsonify, render_template |
|
|
|
app = Flask(__name__) |
|
|
|
@app.route('/') |
|
def home(): |
|
return render_template('index.html') |
|
|
|
@app.route('/predict', methods=['GET', 'POST'])
|
def predict(): |
|
''' |
|
For rendering results on HTML GUI |
|
''' |
|
|
    if request.method == "POST":

        tweetPredict = request.form.get("tweet")

        # vectorize the raw tweet with the fitted TF-IDF vectorizer before predicting
        features = tfidf_vectorizer.transform([str(tweetPredict)])

        prediction = svm.predict(features)

        labels = {0: "Negative", 1: "Neutral", 2: "Positive"}

        output = labels[int(prediction[0])]

        return render_template("index.html", prediction_text='The tweet is {}'.format(output))

    return render_template("index.html")
|
|
|
@app.route('/predict_api', methods=['POST'])

def predict_api():

    '''

    For direct API calls through requests

    '''

    data = request.get_json(force=True)

    # expect a JSON payload such as {"tweet": "..."}; vectorize the text before predicting
    features = tfidf_vectorizer.transform([str(v) for v in data.values()])

    prediction = svm.predict(features)

    output = int(prediction[0])

    return jsonify(output)
|
|
|
if __name__ == "__main__": |
|
app.run(debug=True) |
|
|