# -*- coding: utf-8 -*-
"""SafaricomProject.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Q0IBBWS6EJsk7j1mGoRghqpR-dePQ3yi
"""
# Dependencies: pandas, numpy, nltk, emoji, scikit-learn, vaderSentiment,
# matplotlib, flask (pip install them before running)
import numpy as np
import pandas as pd
# Read csv file into a pandas dataframe
# from google.colab import files
# uploaded = files.upload()
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import re
#from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Reading Dataset
df = pd.read_csv('safaricomDataset.csv')
df.head()
df.columns
df.shape
tweets_df = df[["Date", "User", "Tweet"]].copy()  # .copy() avoids SettingWithCopyWarning when adding columns below
tweets_df.head()
# from sklearn import utils
tweets_df.shape
"""#Preprocessing and Cleaning of the Dataset """
nltk.download('punkt')
# pip install emoji
# import re
# import emoji
def tokenize_tweets(text):
    # convert emojis to text aliases (e.g. ":weary_face:") so they survive tokenization
    text = emoji.demojize(text)
    # remove urls
    text = re.sub(r'http[s]?://\S+', '', text)
    # remove punctuation (this also strips the colons around emoji aliases)
    text = re.sub(r'[^\w\s]', '', text)
    # strip numbers
    text = re.sub(r'[0-9]+', '', text)
    return word_tokenize(text)
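# e.g. tokenize_tweets("Safaricom data is slow 😩 https://t.co/xyz 123")
#   -> ['Safaricom', 'data', 'is', 'slow', 'weary_face']   (illustrative)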
tweets_df["Tweets"] = tweets_df["Tweet"].apply(lambda x: tokenize_tweets(x))
nltk.download('stopwords')
stop = stopwords.words("english")
tweets_df["stop_words"] = tweets_df["Tweets"].apply(lambda x: [w for w in x if w in stop])
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [w.lower() for w in x if w not in stop])
tweets_df.head(10)
string.punctuation
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [stemmer.stem(w) for w in x])
tweets_df.head()
def remove_punct(tokens):
    # join the token list back into a string, dropping lone punctuation tokens
    text = " ".join([tok for tok in tokens if tok not in string.punctuation])
    # strip any digits that remain
    text = re.sub(r'[0-9]+', '', text)
    return text
tweets_df['tweet_punct'] = tweets_df['Tweets'].apply(remove_punct)
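# e.g. remove_punct(['safaricom', 'network', '!']) -> 'safaricom network'   (illustrative)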
tweets_df.head()
"""#Data Visualization(Word Cloud)"""
#all_words = ' '.join([text for text in df['Tweet']])
#wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
#plt.figure(figsize=(10, 7))
#plt.imshow(wordcloud, interpolation="bilinear")
#plt.axis('off')
#plt.show()
"""#Get the most frequent words"""
cnt = Counter()
for text in df["Tweet"].values:
for word in text.split():
cnt[word] += 1
cnt.most_common(20)
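# Note: these counts come from the raw Tweet column, so stop words, mentions
# and URLs will likely dominate the top of the list.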
"""#Using Vader Library to analyse sentiments in Text"""
# !pip install vaderSentiment
"""#Training of Dataset"""
analyzer = SentimentIntensityAnalyzer()
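# polarity_scores() returns a dict with 'neg', 'neu' and 'pos' proportions plus
# a normalized 'compound' score in [-1, 1]; 'compound' drives the labels below.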
"""#Getting the sentiments label"""
def sentiment_score_compound(sentence):
score = analyzer.polarity_scores(sentence)
return score['compound']
def sentiment_score_pos(sentence):
score = analyzer.polarity_scores(sentence)
return score['pos']
def sentiment_score_neg(sentence):
score = analyzer.polarity_scores(sentence)
return score['neg']
def sentiment_score_neu(sentence):
score = analyzer.polarity_scores(sentence)
return score['neu']
tweets_df["tweets_sent_compound"] = tweets_df["Tweet"].apply(lambda x: sentiment_score_compound(x))
tweets_df["tweets_sent_pos"] = tweets_df["Tweet"].apply(lambda x: sentiment_score_pos(x))
tweets_df["tweets_sent_neg"] = tweets_df["Tweet"].apply(lambda x: sentiment_score_neg(x))
tweets_df.head()
tweets_df.tail()
#wordlist = nltk.FreqDist(all_words)
#word_features = wordlist.keys()
"""#Vectorization"""
# Collect the cleaned tweet strings for the vectorizers below
tweets_list = tweets_df["tweet_punct"].tolist()
len(tweets_list)
"""#Define Labels(Positive, Negative, Neutral)"""
# negative label is 0
# neutral label is 1
# positive label is 2
def label_value(val):
if val < 0:
return 0
elif val == 0:
return 1
else:
return 2
tweets_df["label"] = tweets_df["tweets_sent_compound"].apply(lambda x: label_value(x))
tweets_df.head()
cv = CountVectorizer(binary=True)
cv.fit(tweets_list)
X = cv.transform(tweets_list)
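# binary=True records token presence/absence (0/1) rather than raw counts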
y = tweets_df["label"].values
"""#Plotting the Label Results"""
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]
# plot positive vs. negative scores in one scatter call
plt.scatter(tweets_df["tweets_sent_pos"], tweets_df["tweets_sent_neg"], color='blue')
plt.title('Safaricom Tweets Sentiment Analysis', fontsize=20)
plt.xlabel('← Negative - - - Neutral - - - Positive →', fontsize=15)
plt.ylabel('← Facts - - - - - - - Opinions →', fontsize=15)
plt.show()
"""#Plotting on a Pie Chart and Bar Chart
"""
# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
tweets_df['label'].value_counts().plot(kind='pie', autopct='%1.0f%%')
plt.show()
tweets_df['label'].value_counts().sort_index().plot.bar()
plt.show()
"""#Classification using SVM"""
# encoder = preprocessing.LabelEncoder()
# X = tfIdf.fit_transform(df['Text'])
# y = df['tweets_sent_compound']
# X.shape
# X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=0)
# encoder = preprocessing.LabelEncoder()
# y_train = encoder.fit_transform(y_train)
# y_test = encoder.fit_transform(y_test)
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.2, random_state = 0)
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(tweets_list)
X = ngram_vectorizer.transform(tweets_list)
y = tweets_df["label"].values
# NB: train_size=0.2 fits on only 20% of the data and validates on the remaining 80%
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)
svm = LinearSVC()
svm.fit(X_train, y_train)
pred = svm.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
"""#TF-IDF Vectroization"""
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)
svm = LinearSVC()
svm.fit(X_train, y_train)
pred = svm.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
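# Sketch (not part of the original notebook): the vectorize-then-fit steps above
# can be wrapped in a single sklearn Pipeline so one object handles both training
# and prediction; the name `sentiment_pipeline` is illustrative.
from sklearn.pipeline import make_pipeline

sentiment_pipeline = make_pipeline(TfidfVectorizer(), LinearSVC())
X_tr, X_va, y_tr, y_va = train_test_split(tweets_list, y, test_size=0.2, random_state=0)
sentiment_pipeline.fit(X_tr, y_tr)
print("Pipeline accuracy:", accuracy_score(y_va, sentiment_pipeline.predict(X_va)))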
#print(pred.predict([[0, 1, 2]]))
"""#Classification using Logistic Regression"""
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
"""#Using TF-IDF Vectorization"""
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
print("Accuracy:", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
"""#Classification using Naives Bayes"""
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
pred = MNB.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
"""# TF-IDF Vectorization"""
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
pred = MNB.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
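# Deployment sketch (an assumption, not in the original): persist the fitted
# vectorizer and classifier with joblib (installed alongside scikit-learn) so the
# Flask app below could reload them instead of relying on module-level state;
# the file names are illustrative.
from joblib import dump

dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
dump(svm, "svm_model.joblib")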
"""#Serving Predictions with Flask"""
from flask import Flask, request, jsonify, render_template

app = Flask(__name__)
@app.route('/')
def home():
return render_template('index.html')
@app.route('/predict', methods=['GET', 'POST'])
def predict():
    '''
    For rendering results on the HTML GUI
    '''
    if request.method == "POST":
        # read the form field named "tweet" from the HTML form
        tweetPredict = request.form.get("tweet")
        # vectorize with the fitted TF-IDF vectorizer before predicting
        prediction = svm.predict(tfidf_vectorizer.transform([tweetPredict]))
        # map the numeric label back to its sentiment name
        labels = {0: "negative", 1: "neutral", 2: "positive"}
        output = labels[int(prediction[0])]
        return render_template("index.html", prediction_text='The tweet is {}'.format(output))
    return render_template("index.html")
@app.route('/predict_api', methods=['POST'])
def predict_api():
    '''
    For direct API calls through requests
    '''
    data = request.get_json(force=True)
    # vectorize the posted text values before predicting
    texts = [str(v) for v in data.values()]
    prediction = svm.predict(tfidf_vectorizer.transform(texts))
    # cast to int: numpy integers are not JSON-serializable
    output = int(prediction[0])
    return jsonify(output)
if __name__ == "__main__":
app.run(debug=True)