# -*- coding: utf-8 -*-
"""SafaricomProject.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Q0IBBWS6EJsk7j1mGoRghqpR-dePQ3yi
"""

# pip install pandas
import numpy as np
import pandas as pd

# Read csv file into a pandas dataframe
# from google.colab import files
# uploaded = files.upload()

import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt
import re
#from wordcloud import WordCloud
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Reading Dataset
df = pd.read_csv('safaricomDataset.csv')
df.head()

df.columns

df.shape

# keep only the columns used in the analysis; .copy() avoids a
# SettingWithCopyWarning when new columns are added below
tweets_df = df[["Date", "User", "Tweet"]].copy()
tweets_df.head()

# from sklearn import utils

tweets_df.shape

"""#Preprocessing and Cleaning of the Dataset"""

nltk.download('punkt')

# pip install emoji

def tokenize_tweets(text):
    # convert emojis to their :alias: text form (the colons are then
    # stripped by the punctuation regex below)
    text = emoji.demojize(text)
    # remove urls
    text = re.sub(r'http[s]?://\S+', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # strip numbers
    text = re.sub(r'[0-9]+', '', text)
    return word_tokenize(text)

tweets_df["Tweets"] = tweets_df["Tweet"].apply(tokenize_tweets)

nltk.download('stopwords')
stop = stopwords.words("english")

# keep the stop words in their own column, then lower-case the remaining
# tokens; comparing on w.lower() also catches capitalised stop words
tweets_df["stop_words"] = tweets_df["Tweets"].apply(lambda x: [w for w in x if w.lower() in stop])
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [w.lower() for w in x if w.lower() not in stop])
tweets_df.head(10)

string.punctuation

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [stemmer.stem(w) for w in x])
tweets_df.head()

def remove_punct(text):
    # join the stemmed token list back into one string, dropping tokens
    # that are bare punctuation, then strip any remaining digits
    text = " ".join([token for token in text if token not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

tweets_df['tweet_punct'] = tweets_df['Tweets'].apply(remove_punct)
tweets_df.head()

"""#Data Visualization (Word Cloud)"""

#all_words = ' '.join([text for text in df['Tweet']])
#wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
#plt.figure(figsize=(10, 7))
#plt.imshow(wordcloud, interpolation="bilinear")
#plt.axis('off')
#plt.show()

"""#Get the most frequent words"""

cnt = Counter()
for text in df["Tweet"].values:
    for word in text.split():
        cnt[word] += 1
cnt.most_common(20)

"""#Using Vader Library to analyse sentiments in Text"""

# !pip install vaderSentiment

"""#Training of Dataset"""

analyzer = SentimentIntensityAnalyzer()

"""#Getting the sentiments label"""

def sentiment_score_compound(sentence):
    score = analyzer.polarity_scores(sentence)
    return score['compound']

def sentiment_score_pos(sentence):
    score = analyzer.polarity_scores(sentence)
    return score['pos']

def sentiment_score_neg(sentence):
    score = analyzer.polarity_scores(sentence)
    return score['neg']

def sentiment_score_neu(sentence):
    score = analyzer.polarity_scores(sentence)
    return score['neu']
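# Quick sanity check (an illustrative addition, not a cell from the original
# notebook): VADER's polarity_scores returns a dict with 'neg', 'neu' and
# 'pos' proportions plus a 'compound' score in [-1, 1]; the sentence below
# is made up for the example.
print(analyzer.polarity_scores("Safaricom customer care was very helpful!"))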
tweets_df["tweets_sent_compound"] = tweets_df["Tweet"].apply(sentiment_score_compound)
tweets_df["tweets_sent_pos"] = tweets_df["Tweet"].apply(sentiment_score_pos)
tweets_df["tweets_sent_neg"] = tweets_df["Tweet"].apply(sentiment_score_neg)
tweets_df.head()

tweets_df.tail()

#wordlist = nltk.FreqDist(all_words)
#word_features = wordlist.keys()

"""#Vectorization"""

tweets_list = list(tweets_df["tweet_punct"])
len(tweets_list)

# alternative vectorizers that were explored; they are not used by the
# models below
tfIdf = TfidfVectorizer(max_features=20000)
vec = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf=True, use_idf=True, ngram_range=(1, 2))
#len(all_words)

"""#Define Labels (Positive, Negative, Neutral)"""

# negative label is 0
# neutral label is 1
# positive label is 2
def label_value(val):
    if val < 0:
        return 0
    elif val == 0:
        return 1
    else:
        return 2

tweets_df["label"] = tweets_df["tweets_sent_compound"].apply(label_value)
tweets_df.head()

cv = CountVectorizer(binary=True)
cv.fit(tweets_list)
X = cv.transform(tweets_list)
y = tweets_df["label"].values
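# Illustrative sketch (not from the original notebook): what the binary
# CountVectorizer produces on a tiny made-up corpus. get_feature_names_out
# needs scikit-learn >= 1.0; on older versions use get_feature_names().
toy_corpus = ["good network good service", "bad network"]
toy_cv = CountVectorizer(binary=True)
toy_X = toy_cv.fit_transform(toy_corpus)
print(toy_cv.get_feature_names_out())  # ['bad' 'good' 'network' 'service']
print(toy_X.toarray())                 # one 0/1 presence row per document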
"""#Plotting the Label Results"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]

# plot the whole columns at once; a row-by-row loop is slow and would
# shadow the label array y defined above
plt.scatter(tweets_df["tweets_sent_pos"], tweets_df["tweets_sent_neg"], color='Blue')
plt.title('Safaricom Tweets Sentiment Analysis', fontsize=20)
plt.xlabel('← Negative — — — Neutral — — — Positive →', fontsize=15)
plt.ylabel('← Facts — — — — — — — Opinions →', fontsize=15)
plt.show()

"""#Plotting on a Pie Chart and Bar Chart"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
tweets_df['label'].value_counts().plot(kind='pie', autopct='%1.0f%%')
plt.show()

tweets_df['label'].value_counts().sort_index().plot.bar()
plt.show()

"""#Classification using SVM"""

# encoder = preprocessing.LabelEncoder()
# X = tfIdf.fit_transform(df['Text'])
# y = df['tweets_sent_compound']
# X.shape
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# encoder = preprocessing.LabelEncoder()
# y_train = encoder.fit_transform(y_train)
# y_test = encoder.fit_transform(y_test)

# binary presence features over uni-, bi- and trigrams
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
ngram_vectorizer.fit(tweets_list)
X = ngram_vectorizer.transform(tweets_list)
y = tweets_df["label"].values

# note: train_size=0.2 trains on 20% of the data and validates on the
# remaining 80%; use test_size=0.2 instead to train on the larger share
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)

# LinearSVC is fitted in a single call, so no manual epoch loop is needed
svm = LinearSVC()
svm.fit(X_train, y_train)
pred = svm.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

"""#TF-IDF Vectorization"""

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)

svm = LinearSVC()
svm.fit(X_train, y_train)
pred = svm.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

"""#Classification using Logistic Regression"""

# reuses the TF-IDF train/validation split from the previous cell;
# pass max_iter=1000 if the solver warns about convergence
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

"""#Using TF-IDF Vectorization"""

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)

lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
print("Accuracy:", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

"""#Classification using Naive Bayes"""

# reuses the TF-IDF train/validation split from the previous cell
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
pred = MNB.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

"""# TF-IDF Vectorization"""

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(tweets_list)
X = tfidf_vectorizer.transform(tweets_list)
y = tweets_df["label"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=0)

MNB = MultinomialNB()
MNB.fit(X_train, y_train)
pred = MNB.predict(X_val)
print("Accuracy: ", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))
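# The Flask app below reuses the fitted `svm` model and `tfidf_vectorizer`
# from the cells above, which only works while this process is alive.
# A minimal sketch (an illustrative addition; the file names are arbitrary)
# of persisting them with joblib, which ships with scikit-learn, so a
# standalone server could reload them:
from joblib import dump
dump(svm, 'svm_model.joblib')
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
# in a separate process: svm = joblib.load('svm_model.joblib'), etc.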
#import numpy as np
from flask import Flask, request, jsonify, render_template

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['GET', 'POST'])
def predict():
    '''
    For rendering results on HTML GUI
    '''
    #int_features = 'Safaricom is good'
    #final_features = [{'Tweet': int_features}]
    #dfPrediction = pd.DataFrame(final_features)
    #prediction = svm.predict(dfPrediction['Tweet'])
    #output = round(prediction[0], 2)
    #return render_template('index.html', prediction_text='The tweet is {}'.format(output))
    if request.method == "POST":
        # getting the input with name="tweet" from the HTML form
        tweetPredict = request.form.get("tweet")
        # vectorize the raw text with the same fitted vectorizer used in
        # training before predicting
        X_new = tfidf_vectorizer.transform([tweetPredict])
        prediction = svm.predict(X_new)
        # map the numeric label back to its sentiment name
        labels = {0: "negative", 1: "neutral", 2: "positive"}
        output = labels[int(prediction[0])]
        #return "The tweet is " + tweetPredict
        return render_template("index.html", prediction_text='The tweet is {}'.format(output))
    # a view function must return a response for GET requests too
    return render_template("index.html")

@app.route('/predict_api', methods=['POST'])
def predict_api():
    '''
    For direct API calls through request
    '''
    data = request.get_json(force=True)
    # vectorize the tweet text from the JSON payload before predicting
    X_new = tfidf_vectorizer.transform([str(v) for v in data.values()])
    prediction = svm.predict(X_new)
    output = int(prediction[0])
    return jsonify(output)

if __name__ == "__main__":
    app.run(debug=True)
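# Example client call (illustrative; assumes the server is running locally
# on Flask's default port 5000 and that templates/index.html exists for
# the GUI routes):
#
#   import requests
#   r = requests.post('http://127.0.0.1:5000/predict_api',
#                     json={'tweet': 'Safaricom data bundles are too expensive'})
#   print(r.json())  # 0 = negative, 1 = neutral, 2 = positive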