File size: 11,661 Bytes
830390f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5eef608
830390f
 
 
5eef608
830390f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# -*- coding: utf-8 -*-
"""Homework05.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UY5nOy6oxpblrAJFEKZOgbw0jIBl7vUn

# **Part I: Apply Classification methods on Text Classification Dataset**

**Develop a Machine Learning workflow for text classification using machine learning models. The following questions should be completed in the Jupyter Notebook.**

**Task 1: (10 points)  We have a Homework05 progress discussion (Homework05_discussion) due on Wednesday (Oct 26) to report what progress you/your group have achieved. Everyone needs to submit a report (at least 100 words), including a progress description for Tasks 2-4 and plans for the remaining questions.**

**Task 2: (5 points)  Prepare the dataset from Lab06-A**

**Requirement: You must follow steps in (Lab06-PartA: Bag-of-Words for Text Processing and Feature Extraction) to generate the word count tables using Bag-of-Words techniques for the combination of IMDb, Amazon, and Yelp datasets.**
"""

# Switch into the folder that holds the labelled-sentence files so that the
# relative file names used by the pd.read_csv calls resolve. The notebook used
# the IPython "cd" automagic here, which is a syntax error in a plain .py file.
import os

os.chdir('/content/drive/MyDrive/Colab Notebooks/sentiment labelled sentences/sentiment labelled sentences')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Read each tab-separated review file into a two-column frame
# (columns are named 'sentence' and 'label').
_COLUMN_NAMES = ['sentence', 'label']
yelp_df = pd.read_csv('yelp_labelled.txt', sep='\t', names=_COLUMN_NAMES)
amazon_df = pd.read_csv('amazon_cells_labelled.txt', sep='\t', names=_COLUMN_NAMES)
imdb_df = pd.read_csv('imdb_labelled.txt', sep='\t', names=_COLUMN_NAMES)

# Report the size of every source before merging.
for heading, frame in (
    ("Yelp shape : ", yelp_df),
    ("Amazon shape : ", amazon_df),
    ("imdb shape : ", imdb_df),
):
    print(heading, frame.shape)

# Stack the three sources into one frame with a fresh 0..n-1 index.
con_label = [yelp_df, amazon_df, imdb_df]
input_df = pd.concat(objs=con_label, ignore_index=True)
print("input shape : ", input_df.shape)

# Histogram of the numeric column(s) of the combined frame.
input_df.hist()

"""**Task 3:  (5 points) Dividing the full dataset into separate training and test dataset**"""

# Hold out 20% of the sentences for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    input_df['sentence'], input_df['label'], test_size=0.2, random_state=42
)

# Boolean mask selecting the training rows whose label is 0.
y0 = y_train == 0
# Sum the mask to count the label-0 rows; len() of the mask would only repeat
# the size of the whole training split.
print(y0.sum())

"""**Task 4: (5 points) Report the frequency of classes (positive, negative classes) in train, and test set. Are they balanced?**"""

# Side-by-side label histograms: training split on the left, test split on the right.
for panel, labels in enumerate((y_train, y_test), start=1):
    plt.subplot(1, 2, panel)
    labels.hist()

# Convert the training sentences to a plain list before vectorizing.
x_train = x_train.to_list()

from sklearn.feature_extraction.text import CountVectorizer
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the former integer min_df=0 produced; newer scikit-learn
# releases validate parameters and reject an integer min_df below 1.
vectorizer = CountVectorizer(min_df=1, lowercase=False, stop_words='english')
# Learn the vocabulary from the training sentences only.
vectorizer.fit(x_train)

print("Vocabulary: ",vectorizer.vocabulary_)
print("Vocabulary words: ",vectorizer.vocabulary_.keys())
print("Vocabulary index: ",vectorizer.vocabulary_.values())

# Turn both splits into dense word-count matrices using the fitted vocabulary.
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
print("Training matrix shape", x_train.shape)
print("Testing matrix shape", x_test.shape)

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training counts only, then apply those same statistics
# to both splits.
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_scale = standardscaler.transform(x_train)
x_test_scale = standardscaler.transform(x_test)

"""## **Logistic regression**"""

from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the standardized word counts.
lr = LogisticRegression(random_state=0).fit(x_train_scale, y_train)

from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy on the training split.
cv_scores_lr = cross_val_score(estimator=lr, X=x_train_scale, y=y_train, cv=10, scoring='accuracy')

# Predictions of the fitted model on the held-out test split.
y_pred = lr.predict(x_test_scale)

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score
print("Accuracy of test dataset: ", accuracy_score(y_test, y_pred))
print("Precision of test dataset: ", precision_score(y_test, y_pred))
print("Recall of test dataset: ", recall_score(y_test, y_pred))
print("F1-Score of test dataset: ", f1_score(y_test, y_pred))

from sklearn.model_selection import cross_val_predict
# Out-of-fold class probabilities on the test split, used to draw the ROC
# curve. The standardized features are passed here so the curve is built on
# the same representation used for fitting and scoring lr above (the raw
# counts in x_test were used before, which was inconsistent).
y_scores_lr = cross_val_predict(lr, x_test_scale, y_test, cv=10, method="predict_proba")
# Column 1 is the probability of the second class (label 1 for 0/1 labels).
y_scores_lr_new = y_scores_lr[:, 1]

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores_lr_new)

plt.plot(fpr, tpr, linewidth=2, label='Logistic Regression')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()

"""## **Task 9.1: Linear discriminant analysis:**"""

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit linear discriminant analysis on the (unscaled) word counts.
clf = LinearDiscriminantAnalysis()

clf.fit(x_train, y_train)

from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy on the training split.
CV_scores_clf = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_clf)

plt.boxplot(CV_scores_clf)
plt.title("10-fold cross validation accuracy")
plt.xlabel("linear discriminative analysis")
plt.ylabel("Accuracy")

y_test_pred = clf.predict(x_test)
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score
# scikit-learn metrics expect (y_true, y_pred). Passing them the other way
# round, as was done before, reports recall as precision and vice versa.
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("recall_score:", recall_score(y_test, y_test_pred))
print("f1_score:", f1_score(y_test, y_test_pred))

from sklearn.model_selection import cross_val_predict
# Out-of-fold class probabilities on the test split, used to draw the ROC curve.
y_scores_clf = cross_val_predict(clf, x_test, y_test, cv=10, method="predict_proba")
# Column 1 is the probability of the second class (label 1 for 0/1 labels).
y_scores_clf_new = y_scores_clf[:, 1]

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores_clf_new)

plt.plot(fpr, tpr, linewidth=2, label='Linear discriminative analysis')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()

"""## **Task 9.2: Quadratic discriminant analysis**"""

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit quadratic discriminant analysis on the (unscaled) word counts.
qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)

from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy on the training split.
CV_scores_qda = cross_val_score(estimator=qda, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_qda)

plt.boxplot(CV_scores_qda)
plt.title("10-fold cross validation accuracy")
plt.xlabel("quadratic discriminant analysis")
plt.ylabel("Accuracy")

y_test_pred1 = qda.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred). Passing them the other way
# round, as was done before, reports recall as precision and vice versa.
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred1))
print("Precision:", precision_score(y_test, y_test_pred1))
print("recall_score:", recall_score(y_test, y_test_pred1))
print("f1_score:", f1_score(y_test, y_test_pred1))

# Out-of-fold class probabilities on the test split, used to draw the ROC curve.
y_scores_qda = cross_val_predict(qda, x_test, y_test, cv=10, method="predict_proba")
# roc_curve needs the score of the positive class, i.e. column 1 (column 0,
# used before, is the first class and yields a mirrored curve).
y_scores_qda_new = y_scores_qda[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores_qda_new)

plt.plot(fpr, tpr, linewidth=2, label='quadratic discriminative analysis')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()

"""## **Task 9.3: Naive bayes model (optimal choice for text classification)**"""

from sklearn.naive_bayes import MultinomialNB
# Fit multinomial naive Bayes on the (unscaled) word counts.
nbm = MultinomialNB()
nbm.fit(x_train, y_train)

from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy on the training split.
CV_scores_nbm = cross_val_score(estimator=nbm, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_nbm)

plt.boxplot(CV_scores_nbm)
plt.title("10-fold cross validation accuracy")
plt.xlabel("naive bayes analysis")
plt.ylabel("Accuracy")

y_test_pred2 = nbm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred). Passing them the other way
# round, as was done before, reports recall as precision and vice versa.
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred2))
print("Precision:", precision_score(y_test, y_test_pred2))
print("recall_score:", recall_score(y_test, y_test_pred2))
print("f1_score:", f1_score(y_test, y_test_pred2))

# Out-of-fold class probabilities on the test split, used to draw the ROC curve.
y_scores_nbm = cross_val_predict(nbm, x_test, y_test, cv=10, method="predict_proba")
# Column 1 is the probability of the second class (label 1 for 0/1 labels).
y_scores_nbm_new = y_scores_nbm[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores_nbm_new)

plt.plot(fpr, tpr, linewidth=2, label='naive bayes analysis')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()

"""## **Task 9.4: Support Vector Machine**"""

from sklearn.svm import SVC
# probability=True is required so that predict_proba is available on the SVC.
svm = SVC(probability=True)
svm.fit(x_train, y_train)

from sklearn.model_selection import cross_val_score
# Only 2 folds are used for this model (the other sections use 10).
CV_scores_svm = cross_val_score(estimator=svm, X=x_train, y=y_train, cv=2, scoring='accuracy')
print("CV_scores: ", CV_scores_svm)

plt.boxplot(CV_scores_svm)
# The title states the fold count actually used above (it said 10-fold before).
plt.title("2-fold cross validation accuracy")
plt.xlabel("Support Vector Machine")
plt.ylabel("Accuracy")

y_test_pred3 = svm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred). Passing them the other way
# round, as was done before, reports recall as precision and vice versa.
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred3))
print("Precision:", precision_score(y_test, y_test_pred3))
print("recall_score:", recall_score(y_test, y_test_pred3))
print("f1_score:", f1_score(y_test, y_test_pred3))

# Out-of-fold probabilities from the SVM itself. The naive Bayes estimator and
# its scores were used here before, so the plotted curve was not the SVM's.
y_scores_svm = cross_val_predict(svm, x_test, y_test, cv=10, method="predict_proba")
# Column 1 is the probability of the second class (label 1 for 0/1 labels).
y_scores_svm_new = y_scores_svm[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores_svm_new)

plt.plot(fpr, tpr, linewidth=2, label='Support vector machine')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()

"""**Task 10: (Bonus 10 points) How to improve the classification accuracy?**

from sklearn.preprocessing import StandardScaler

standardscaler=StandardScaler()

x_train_scale= standardscaler.fit_transform(x_train)

x_test_scale= standardscaler.transform(x_test)

--> Standardizing the features as shown above can improve the classification accuracy.

## **Part II (20 points):  Deploy the machine learning models on Gradio or huggingface**
"""

##!pip install --quiet gradio

from gradio.outputs import Label
import gradio as gr
##import tensorflow as tf

def caption(input_module, input_module1):
    """Classify one review sentence with the selected model.

    Args:
        input_module: Raw review text to classify.
        input_module1: Name of the classification method to use, e.g.
            "Logistic Regression" or "Naive Bayes classifier".

    Returns:
        A ``(label, probabilities)`` tuple. ``label`` is "Negative comment"
        when the model predicts class 0 and "Positive comment" otherwise;
        ``probabilities`` maps "Negative Comment" / "Positive Comment" to the
        values returned by the model's ``predict_proba``.

    Raises:
        ValueError: If ``input_module1`` is not a supported method name.
    """
    class_a = ["Negative Comment", "Positive Comment"]

    # Pick the estimator before doing any work so that an unsupported method
    # fails with a clear message instead of an unbound-variable error later.
    needs_scaling = False
    if input_module1 == "Logistic Regression":
        model = lr
        # lr is fitted on standardized counts, so its input must be scaled too.
        needs_scaling = True
    elif input_module1 == "Linear discriminant analysis":
        model = clf
    elif input_module1 == "Quadratic discriminant analysis":
        model = qda
    elif input_module1 == "Naive Bayes classifier":
        model = nbm
    elif input_module1 in ("Support Vector Machine", "Support Vecotr Machine"):
        # The misspelled variant is accepted too, since the method dropdown
        # has offered the choice under that spelling.
        model = svm
    else:
        raise ValueError(f"Unsupported classification method: {input_module1!r}")

    # Encode the sentence with the vocabulary learned during training.
    features = vectorizer.transform([input_module]).toarray()
    if needs_scaling:
        features = standardscaler.transform(features)

    predicted_label = model.predict(features)[0]
    predictions = model.predict_proba(features)[0]

    output1 = "Negative comment" if predicted_label == 0 else "Positive comment"
    # Plain floats keep the probability mapping easy to serialize for the UI.
    output2 = {name: float(prob) for name, prob in zip(class_a, predictions)}
    return output1, output2

# Input widgets: the review text and the model used to classify it.
input_module= gr.inputs.Textbox(label = "Review comment")
# Each choice must match a method name compared in caption() exactly; the SVM
# entry was misspelled ("Vecotr") before and therefore never matched.
input_module1= gr.inputs.Dropdown(choices=["Logistic Regression","Linear discriminant analysis", "Quadratic discriminant analysis","Naive Bayes classifier","Support Vector Machine"], label = "Method")

# Output widgets: the predicted class as text and the per-class probabilities.
output1 = gr.outputs.Textbox(label = "Predicted Class")
output2=gr.outputs.Label(label= "probability of class")
 
# Wire caption() to the widgets and start the app.
gr.Interface(fn=caption, inputs=[input_module,input_module1], outputs=[output1,output2]).launch(debug=True)