JoanWaweru committed on
Commit
48f3dfc
•
1 Parent(s): e24f323

Upload 7 files

Files changed (7)
  1. README.md +5 -0
  2. SafaricomProject.ipynb +0 -0
  3. app.py +36 -0
  4. dataset.py +16 -0
  5. request.py +6 -0
  6. safaricomDataset.csv +0 -0
  7. safaricomproject.py +396 -0
README.md ADDED
@@ -0,0 +1,5 @@
+ # IS2Project
+
+ This is a Customer Sentiment Analysis for Code-Switched Language: A Case of Safaricom Limited. The model detects customer sentiment in code-switched (English-Swahili) tweets from Safaricom users using a Support Vector Machine, and categorizes tweets into good reviews and bad reviews.
+
+ The model is also compared with Logistic Regression and Naive Bayes to see which model performs best.
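For a sense of how that comparison is set up, here is a minimal sketch of the pattern used in the project code below (assuming a CSV with a `Tweet` text column and a numeric `label` column, as the pipeline in `safaricomproject.py` produces; not the project's exact code):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

df = pd.read_csv('safaricomDataset.csv')            # assumed: Tweet + label columns
X = TfidfVectorizer().fit_transform(df['Tweet'])    # shared TF-IDF features
y = df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the three candidate classifiers on identical features and compare accuracy
for model in (LinearSVC(), LogisticRegression(max_iter=1000), MultinomialNB()):
    pred = model.fit(X_train, y_train).predict(X_val)
    print(type(model).__name__, accuracy_score(y_val, pred))
```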
SafaricomProject.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,36 @@
+ from flask import Flask, request, jsonify, render_template
+ import safaricomproject
+
+ app = Flask(__name__)
+
+ # Label encoding used during training: 0 = negative, 1 = neutral, 2 = positive
+ LABELS = {0: 'negative', 1: 'neutral', 2: 'positive'}
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     '''
+     For rendering results on the HTML GUI
+     '''
+     # Read the tweet text from the form and vectorize it with the same
+     # TF-IDF vectorizer the classifier was trained on
+     tweet = request.form.get('tweet', '')
+     features = safaricomproject.tfidf_vectorizer.transform([tweet])
+     prediction = safaricomproject.svm.predict(features)
+     output = LABELS[int(prediction[0])]
+     return render_template('index.html', prediction_text='The tweet is {}'.format(output))
+
+ @app.route('/predict_api', methods=['POST'])
+ def predict_api():
+     '''
+     For direct API calls through requests
+     '''
+     data = request.get_json(force=True)
+     features = safaricomproject.tfidf_vectorizer.transform([data['tweet']])
+     prediction = safaricomproject.svm.predict(features)
+     return jsonify(LABELS[int(prediction[0])])
+
+ if __name__ == "__main__":
+     app.run(debug=True)
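A quick way to exercise the `/predict` route without the HTML page is to POST the form field directly; a minimal sketch using `requests`, assuming the app is running locally on Flask's default port (the `tweet` field name matches what the route above reads):

```python
import requests

# Simulate the index.html form submission handled by /predict
r = requests.post('http://localhost:5000/predict',
                  data={'tweet': 'Safaricom customer care wako poa'})
print(r.status_code)   # 200; the returned HTML embeds the prediction text
```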
dataset.py ADDED
@@ -0,0 +1,16 @@
+ import snscrape.modules.twitter as sntwitter
+ import pandas as pd
+
+ # Scrape up to 5000 tweets directed at @Safaricom_Care, posted before 2022-10-24
+ query = "(@Safaricom_Care) until:2022-10-24"
+ tweets = []
+ limit = 5000
+ for tweet in sntwitter.TwitterSearchScraper(query).get_items():
+     if len(tweets) == limit:
+         break
+     tweets.append([tweet.date, tweet.user.username, tweet.content])
+
+ df = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet'])
+ df.to_csv('safaricomDataset.csv')
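As a sanity check on the scrape, the CSV can be reloaded and inspected before moving on to modelling; a minimal sketch:

```python
import pandas as pd

df = pd.read_csv('safaricomDataset.csv')
print(df.shape)                              # at most 5000 rows
print(df[['Date', 'User', 'Tweet']].head())  # the columns written above
```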
request.py ADDED
@@ -0,0 +1,6 @@
+ import requests
+
+ url = 'http://localhost:5000/predict_api'
+ # The API expects a JSON body carrying the raw tweet text (sample tweet is illustrative)
+ r = requests.post(url, json={'tweet': 'Safaricom network ni poa sana leo'})
+
+ print(r.json())
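With the Flask app from `app.py` running locally, this prints the predicted class name (for example `"positive"`), since `predict_api` responds with the mapped label as sketched above.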
safaricomDataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
safaricomproject.py ADDED
@@ -0,0 +1,396 @@
+ # -*- coding: utf-8 -*-
+ """SafaricomProject.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1Q0IBBWS6EJsk7j1mGoRghqpR-dePQ3yi
+ """
+
+ # pip install pandas emoji vaderSentiment
+
+ import numpy as np
+ import pandas as pd
+
+ # Read csv file into a pandas dataframe
+ # from google.colab import files
+ # uploaded = files.upload()
+ import emoji
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ import string
+ import matplotlib.pyplot as plt
+ import re
+ #from wordcloud import WordCloud
+ from collections import Counter
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import LinearSVC
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.naive_bayes import MultinomialNB
+
+ # Reading Dataset
+
+ df = pd.read_csv('safaricomDataset.csv')
+ df.head()
+
+ df.columns
+
+ df.shape
+
+ # .copy() avoids pandas SettingWithCopyWarning when columns are added later
+ tweets_df = df[["Date", "User", "Tweet"]].copy()
+ tweets_df.head()
+
+ tweets_df.shape
+ """#Preprocessing and Cleaning of the Dataset """
+
+ nltk.download('punkt')
+
+
+ def tokenize_tweets(text):
+     # convert emojis to their text aliases (demojize keeps the signal as words)
+     text = emoji.demojize(text)
+     # remove urls
+     text = re.sub(r'http[s]?://\S+', '', text)
+     # remove punctuation
+     text = re.sub(r'[^\w\s]', '', text)
+     # strip numbers
+     text = re.sub(r'[0-9]+', '', text)
+     return word_tokenize(text)
+
+
+ tweets_df["Tweets"] = tweets_df["Tweet"].apply(tokenize_tweets)
+
+ # Keep the removed stop words in their own column, lowercase the rest
+ nltk.download('stopwords')
+ stop = stopwords.words("english")
+ tweets_df["stop_words"] = tweets_df["Tweets"].apply(lambda x: [w for w in x if w in stop])
+ tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [w.lower() for w in x if w not in stop])
+
+ tweets_df.head(10)
+
+ from nltk.stem.porter import PorterStemmer
+
+ stemmer = PorterStemmer()
+ tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [stemmer.stem(w) for w in x])
+ tweets_df.head()
+
+
+ def remove_punct(tokens):
+     # joins the token list back into one cleaned string for the vectorizers
+     text = " ".join([token for token in tokens if token not in string.punctuation])
+     text = re.sub(r'[0-9]+', '', text)
+     return text
+
+
+ tweets_df['tweet_punct'] = tweets_df['Tweets'].apply(remove_punct)
+
+ tweets_df.head()
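To make the cleaning steps concrete, here is roughly what `tokenize_tweets` yields for a made-up tweet (illustrative only; exact tokens depend on the installed `emoji` and `nltk` versions):

```python
sample = "Thanks @Safaricom_Care 😊 https://t.co/abc data bundle iko sawa 100%"
print(tokenize_tweets(sample))
# demojize first, then URL/punctuation/digit stripping, then word_tokenize:
# ['Thanks', 'Safaricom_Care', 'smiling_face_with_smiling_eyes',
#  'data', 'bundle', 'iko', 'sawa']
```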
+ """#Data Visualization (Word Cloud)"""
+
+ #all_words = ' '.join([text for text in df['Tweet']])
+
+ #wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
+
+ #plt.figure(figsize=(10, 7))
+ #plt.imshow(wordcloud, interpolation="bilinear")
+ #plt.axis('off')
+ #plt.show()
+
+ """#Get the most frequent words"""
+
+ cnt = Counter()
+ for text in df["Tweet"].values:
+     for word in text.split():
+         cnt[word] += 1
+
+ cnt.most_common(20)
+
+ """#Using the VADER Library to analyse sentiment in text"""
+
+ # !pip install vaderSentiment
+
+ """#Scoring the Dataset"""
+
+ analyzer = SentimentIntensityAnalyzer()
+
+ """#Getting the sentiment labels"""
+
+
+ def sentiment_score_compound(sentence):
+     score = analyzer.polarity_scores(sentence)
+     return score['compound']
+
+
+ def sentiment_score_pos(sentence):
+     score = analyzer.polarity_scores(sentence)
+     return score['pos']
+
+
+ def sentiment_score_neg(sentence):
+     score = analyzer.polarity_scores(sentence)
+     return score['neg']
+
+
+ def sentiment_score_neu(sentence):
+     score = analyzer.polarity_scores(sentence)
+     return score['neu']
+
+
+ tweets_df["tweets_sent_compound"] = tweets_df["Tweet"].apply(sentiment_score_compound)
+ tweets_df["tweets_sent_pos"] = tweets_df["Tweet"].apply(sentiment_score_pos)
+ tweets_df["tweets_sent_neg"] = tweets_df["Tweet"].apply(sentiment_score_neg)
+ tweets_df.head()
+
+ tweets_df.tail()
+
+ #wordlist = nltk.FreqDist(all_words)
+ #word_features = wordlist.keys()
+
+ """#Vectorization"""
+
+ # Cleaned tweet strings that the vectorizers below are fitted on
+ tweets_list = list(tweets_df["tweet_punct"])
+ len(tweets_list)
+
+ """#Define Labels (Positive, Negative, Neutral)"""
+
+ # negative label is 0
+ # neutral label is 1
+ # positive label is 2
+
+ def label_value(val):
+     if val < 0:
+         return 0
+     elif val == 0:
+         return 1
+     else:
+         return 2
+
+
+ # Turn the VADER compound score into a three-class label
+ tweets_df["label"] = tweets_df["tweets_sent_compound"].apply(label_value)
+ tweets_df.head()
+
+ cv = CountVectorizer(binary=True)
+ cv.fit(tweets_list)
+ X = cv.transform(tweets_list)
+ y = tweets_df["label"].values
+ """#Plotting the Label Results"""
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # %matplotlib inline
+ plt.rcParams['figure.figsize'] = [10, 8]
+ # Scatter the positive score against the negative score for every tweet
+ plt.scatter(tweets_df.tweets_sent_pos, tweets_df.tweets_sent_neg, color='Blue')
+
+ plt.title('Safaricom Tweets Sentiment Analysis', fontsize=20)
+ plt.xlabel('← Negative — — — Neutral — — — Positive →', fontsize=15)
+ plt.ylabel('← Facts — — — — — — — Opinions →', fontsize=15)
+ plt.show()
+
+ """#Plotting on a Pie Chart and Bar Chart"""
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # %matplotlib inline
+ tweets_df['label'].value_counts().plot(kind='pie', autopct='%1.0f%%')
+ plt.show()
+
+ tweets_df['label'].value_counts().sort_index().plot.bar()
+ plt.show()
+ """#Classification using SVM"""
+
+ # encoder = preprocessing.LabelEncoder()
+ # X = tfIdf.fit_transform(df['Text'])
+ # y = df['tweets_sent_compound']
+ # X.shape
+
+ # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ # encoder = preprocessing.LabelEncoder()
+ # y_train = encoder.fit_transform(y_train)
+ # y_test = encoder.fit_transform(y_test)
+
+ # Binary counts over unigrams to trigrams; LinearSVC runs its own
+ # optimization internally, so no explicit epoch loop is needed.
+ ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
+ ngram_vectorizer.fit(tweets_list)
+ X = ngram_vectorizer.transform(tweets_list)
+ y = tweets_df["label"].values
+ # Hold out 20% of the data for validation
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
+ svm = LinearSVC()
+ svm.fit(X_train, y_train)
+
+ pred = svm.predict(X_val)
+ print("Accuracy: ", accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+
+ """#TF-IDF Vectorization"""
+
+ tfidf_vectorizer = TfidfVectorizer()
+ tfidf_vectorizer.fit(tweets_list)
+ X = tfidf_vectorizer.transform(tweets_list)
+ y = tweets_df["label"].values
+
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ svm = LinearSVC()
+ svm.fit(X_train, y_train)
+ pred = svm.predict(X_val)
+ print("Accuracy: ", accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+
+ """#Classification using Logistic Regression"""
+
+ lr = LogisticRegression()
+ lr.fit(X_train, y_train)
+
+ pred = lr.predict(X_val)
+ print("Accuracy: ", accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+
+ """#Using TF-IDF Vectorization"""
+
+ tfidf_vectorizer = TfidfVectorizer()
+ tfidf_vectorizer.fit(tweets_list)
+ X = tfidf_vectorizer.transform(tweets_list)
+ y = tweets_df["label"].values
+
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ lr = LogisticRegression()
+ lr.fit(X_train, y_train)
+
+ pred = lr.predict(X_val)
+ print("Accuracy:", accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+
+ """#Classification using Naive Bayes"""
+
+ MNB = MultinomialNB()
+ MNB.fit(X_train, y_train)
+ pred = MNB.predict(X_val)
+ print(accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+
+ """# TF-IDF Vectorization"""
+
+ tfidf_vectorizer = TfidfVectorizer()
+ tfidf_vectorizer.fit(tweets_list)
+ X = tfidf_vectorizer.transform(tweets_list)
+ y = tweets_df["label"].values
+
+ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
+ MNB = MultinomialNB()
+ MNB.fit(X_train, y_train)
+ pred = MNB.predict(X_val)
+ print("Accuracy: ", accuracy_score(y_val, pred))
+ print(classification_report(y_val, pred))
+ print(confusion_matrix(y_val, pred))
+ from flask import Flask, request, jsonify, render_template
+
+ # Label encoding used above: 0 = negative, 1 = neutral, 2 = positive
+ labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
+
+ app = Flask(__name__)
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+ @app.route('/predict', methods=['GET', 'POST'])
+ def predict():
+     '''
+     For rendering results on the HTML GUI
+     '''
+     if request.method == "POST":
+         # getting the input with name="tweet" in the HTML form
+         tweetPredict = request.form.get("tweet")
+         # vectorize the raw text before handing it to the trained classifier
+         features = tfidf_vectorizer.transform([tweetPredict])
+         prediction = svm.predict(features)
+         output = labels[int(prediction[0])]
+         return render_template("index.html", prediction_text='The tweet is {}'.format(output))
+     return render_template("index.html")
+
+ @app.route('/predict_api', methods=['POST'])
+ def predict_api():
+     '''
+     For direct API calls through requests
+     '''
+     data = request.get_json(force=True)
+     features = tfidf_vectorizer.transform([data['tweet']])
+     prediction = svm.predict(features)
+     return jsonify(labels[int(prediction[0])])
+
+ if __name__ == "__main__":
+     app.run(debug=True)
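One design wrinkle worth flagging: `app.py` imports this module, so the scraped dataset is re-processed and every model retrained on each server start. A common fix is to persist the fitted vectorizer and classifier once and have the app load the artifacts; a minimal sketch with `joblib` (file names are illustrative):

```python
import joblib

# Run once, after training (e.g. at the bottom of safaricomproject.py)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(svm, 'svm_model.joblib')

# In app.py, load the artifacts instead of importing and retraining
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
svm = joblib.load('svm_model.joblib')
```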