# Notebook-export artifacts (original lines read "Spaces:" / "Runtime error");
# kept as comments so the file stays valid Python.
# -*- coding: utf-8 -*- | |
"""Homework05.ipynb | |
Automatically generated by Colaboratory. | |
Original file is located at | |
https://colab.research.google.com/drive/1UY5nOy6oxpblrAJFEKZOgbw0jIBl7vUn | |
# **Part I: Apply Classification methods on Text Classification Dataset** | |
**Develop a Machine Learning workflow for text classification using machine learning models. The following questions should be completed in the Jupyter Notebook.** | |
**Task 1: (10 points) We have Homework05 progress discussion (Homework05_discussion) due on Wednesday (Oct 26) to report what progress you/your group have achieved. Everyone needs to submit a report (at least 100 words), including a progress description for Task 2-4 and plans for the remaining questions.**
**Task 2: (5 points) Prepare the dataset from Lab06-A** | |
**Requirement: You must follow steps in (Lab06-PartA: Bag-of-Words for Text Processing and Feature Extraction) to generate the word count tables using Bag-of-Words techniques for the combination of IMDb, Amazon, and Yelp datasets.** | |
""" | |
# --- Dataset preparation (Task 2): load the three labelled sentiment corpora ---
import os

# The notebook used the `%cd` line magic, which is a syntax error in a plain
# Python script; change the working directory explicitly instead.
os.chdir('/content/drive/MyDrive/Colab Notebooks/sentiment labelled sentences/sentiment labelled sentences')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Each file is tab-separated: <sentence>\t<label> (1 = positive, 0 = negative).
yelp_df = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
amazon_df = pd.read_csv('amazon_cells_labelled.txt', names=['sentence', 'label'], sep='\t')
imdb_df = pd.read_csv('imdb_labelled.txt', names=['sentence', 'label'], sep='\t')
print("Yelp shape : ", yelp_df.shape)
print("Amazon shape : ", amazon_df.shape)
print("imdb shape : ", imdb_df.shape)

# Stack the three corpora into one dataframe with a fresh 0..n-1 index.
con_label = [yelp_df, amazon_df, imdb_df]
input_df = pd.concat(con_label, ignore_index=True)
print("input shape : ", input_df.shape)
input_df.hist()
"""**Task 3: (5 points) Dividing the full dataset into separate training and test dataset**""" | |
x_train, x_test, y_train, y_test = train_test_split(input_df['sentence'], input_df['label'], test_size=0.2, random_state=42) | |
y0=[] | |
y0 = y_train==0 | |
print(len(y0)) | |
"""**Task 4: (5 points) Report the frequency of classes (positive, negative classes) in train, and test set. Are they balanced?**""" | |
plt.subplot(1,2,1) | |
y_train.hist() | |
plt.subplot(1,2,2) | |
y_test.hist() | |
x_train = x_train.to_list()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

# Bag-of-words encoding: case-sensitive tokens, English stop words removed,
# min_df=0 keeps every term that appears in the training sentences.
vectorizer = CountVectorizer(min_df=0, lowercase=False, stop_words='english')
vectorizer.fit(x_train)
vocab = vectorizer.vocabulary_
print("Vocabulary: ", vocab)
print("Vocabulary words: ", vocab.keys())
print("Vocabulary index: ", vocab.values())

# Dense document-term count matrices for both splits, built from the
# vocabulary fitted on the training sentences only.
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
print("Training matrix shape", x_train.shape)
print("Testing matrix shape", x_test.shape)

# Standardize features: statistics learned on the training matrix only,
# then applied unchanged to the test matrix.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.transform(x_test)
"""## **Logistic regression**""" | |
from sklearn.linear_model import LogisticRegression | |
lr = LogisticRegression(random_state=0).fit(x_train_scale, y_train) | |
from sklearn.model_selection import cross_val_score | |
cv_scores_lr = cross_val_score(estimator = lr, X = x_train_scale, y = y_train, cv = 10, scoring = 'accuracy') | |
y_pred = lr.predict(x_test_scale) | |
from sklearn import metrics | |
from sklearn.metrics import accuracy_score | |
from sklearn.metrics import precision_score,recall_score,f1_score | |
print("Accuracy of test dataset: ", accuracy_score(y_test,y_pred )) | |
print("Precision of test dataset: ", precision_score(y_test, y_pred)) | |
print("Recall of test dataset: ", recall_score(y_test, y_pred)) | |
print("F1-Score of test dataset: ", f1_score(y_test, y_pred)) | |
from sklearn.model_selection import cross_val_predict | |
y_scores_lr = cross_val_predict(lr, x_test, y_test, cv=10, method="predict_proba") | |
y_scores_lr_new=y_scores_lr[:,1] | |
from sklearn.metrics import roc_curve | |
fpr, tpr, thresholds = roc_curve(y_test, y_scores_lr_new) | |
plt.plot(fpr, tpr, linewidth=2, label='Logistic Regression') | |
plt.plot([0, 1], [0, 1], 'k--') | |
plt.xlabel('False Positive Rate -> (1-Specificity)') | |
plt.ylabel('True Positive Rate -> (Recall)') | |
plt.legend(loc='lower right') | |
plt.show() | |
"""## **Task 9.1: Linear discriminant analysis:**""" | |
import numpy as np | |
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis | |
clf = LinearDiscriminantAnalysis() | |
clf.fit(x_train, y_train) | |
from sklearn.model_selection import cross_val_score | |
CV_scores_clf = cross_val_score(estimator = clf, X = x_train, y = y_train, cv = 10, scoring = 'accuracy') | |
print("CV_scores: ", CV_scores_clf) | |
plt.boxplot(CV_scores_clf) | |
plt.title("10-fold cross validation accuracy") | |
plt.xlabel("linear discriminative analysis") | |
plt.ylabel("Accuracy") | |
y_test_pred = clf.predict(x_test) | |
from sklearn import metrics | |
from sklearn.metrics import accuracy_score | |
from sklearn.metrics import precision_score,recall_score,f1_score | |
print("Accuracy: ", metrics.accuracy_score(y_test_pred,y_test)) | |
print("Precision:",precision_score(y_test_pred.astype(int), y_test.astype(int))) | |
print("recall_score:",recall_score(y_test_pred.astype(int), y_test.astype(int))) | |
print("f1_score:",f1_score(y_test_pred.astype(int), y_test.astype(int))) | |
from sklearn.model_selection import cross_val_predict | |
y_scores_clf = cross_val_predict(clf, x_test, y_test, cv=10, method="predict_proba") | |
y_scores_clf_new=y_scores_clf[:,1] | |
from sklearn.metrics import roc_curve | |
fpr, tpr, thresholds = roc_curve(y_test, y_scores_clf_new) | |
plt.plot(fpr, tpr, linewidth=2, label='Linear discriminative analysis') | |
plt.plot([0, 1], [0, 1], 'k--') | |
plt.xlabel('False Positive Rate -> (1-Specificity)') | |
plt.ylabel('True Positive Rate -> (Recall)') | |
plt.legend(loc='lower right') | |
plt.show() | |
"""## **Task 9.2: Quadratic discriminant analysis**""" | |
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis | |
qda=QuadraticDiscriminantAnalysis() | |
qda.fit(x_train,y_train) | |
from sklearn.model_selection import cross_val_score | |
CV_scores_qda = cross_val_score(estimator = qda, X = x_train, y = y_train, cv = 10, scoring = 'accuracy') | |
print("CV_scores: ", CV_scores_qda) | |
plt.boxplot(CV_scores_qda) | |
plt.title("10-fold cross validation accuracy") | |
plt.xlabel("quadratic discriminant analysis") | |
plt.ylabel("Accuracy") | |
y_test_pred1 = qda.predict(x_test) | |
print("Accuracy: ", metrics.accuracy_score(y_test_pred1,y_test)) | |
print("Precision:",precision_score(y_test_pred1.astype(int), y_test.astype(int))) | |
print("recall_score:",recall_score(y_test_pred1.astype(int), y_test.astype(int))) | |
print("f1_score:",f1_score(y_test_pred1.astype(int), y_test.astype(int))) | |
y_scores_qda = cross_val_predict(qda, x_test, y_test, cv=10, method="predict_proba") | |
y_scores_qda_new=y_scores_qda[:,0] | |
fpr, tpr, thresholds = roc_curve(y_test, y_scores_qda_new) | |
plt.plot(fpr, tpr, linewidth=2, label='quadratic discriminative analysis') | |
plt.plot([0, 1], [0, 1], 'k--') | |
plt.xlabel('False Positive Rate -> (1-Specificity)') | |
plt.ylabel('True Positive Rate -> (Recall)') | |
plt.legend(loc='lower right') | |
plt.show() | |
"""## **Task 9.3: Naive bayes model (optimal choice for text classification)**""" | |
from sklearn.naive_bayes import MultinomialNB | |
nbm=MultinomialNB() | |
nbm.fit(x_train,y_train) | |
from sklearn.model_selection import cross_val_score | |
CV_scores_nbm = cross_val_score(estimator = nbm, X = x_train, y = y_train, cv = 10, scoring = 'accuracy') | |
print("CV_scores: ", CV_scores_nbm) | |
plt.boxplot(CV_scores_nbm) | |
plt.title("10-fold cross validation accuracy") | |
plt.xlabel("naive bayes analysis") | |
plt.ylabel("Accuracy") | |
y_test_pred2= nbm.predict(x_test) | |
print("Accuracy: ", metrics.accuracy_score(y_test_pred2,y_test)) | |
print("Precision:",precision_score(y_test_pred2.astype(int), y_test.astype(int))) | |
print("recall_score:",recall_score(y_test_pred2.astype(int), y_test.astype(int))) | |
print("f1_score:",f1_score(y_test_pred2.astype(int), y_test.astype(int))) | |
y_scores_nbm = cross_val_predict(nbm,x_test, y_test, cv=10, method="predict_proba") | |
y_scores_nbm_new=y_scores_nbm[:,1] | |
fpr, tpr, thresholds = roc_curve(y_test, y_scores_nbm_new) | |
plt.plot(fpr, tpr, linewidth=2, label='naive bayes analysis') | |
plt.plot([0, 1], [0, 1], 'k--') | |
plt.xlabel('False Positive Rate -> (1-Specificity)') | |
plt.ylabel('True Positive Rate -> (Recall)') | |
plt.legend(loc='lower right') | |
plt.show() | |
"""## **Task 9.4: Support Vector Machine**""" | |
from sklearn.svm import SVC | |
svm=SVC(probability=True) | |
svm.fit(x_train,y_train) | |
from sklearn.model_selection import cross_val_score | |
CV_scores_svm = cross_val_score(estimator = svm, X = x_train, y = y_train, cv = 2, scoring = 'accuracy') | |
print("CV_scores: ", CV_scores_svm) | |
plt.boxplot(CV_scores_svm) | |
plt.title("10-fold cross validation accuracy") | |
plt.xlabel("Support Vector Machine") | |
plt.ylabel("Accuracy") | |
y_test_pred3= svm.predict(x_test) | |
print("Accuracy: ", metrics.accuracy_score(y_test_pred3,y_test)) | |
print("Precision:",precision_score(y_test_pred3.astype(int), y_test.astype(int))) | |
print("recall_score:",recall_score(y_test_pred3.astype(int), y_test.astype(int))) | |
print("f1_score:",f1_score(y_test_pred3.astype(int), y_test.astype(int))) | |
y_scores_svm = cross_val_predict(nbm,x_test, y_test, cv=10, method="predict_proba") | |
y_scores_svm_new=y_scores_nbm[:,1] | |
fpr, tpr, thresholds = roc_curve(y_test, y_scores_svm_new) | |
plt.plot(fpr, tpr, linewidth=2, label='Support vector machine') | |
plt.plot([0, 1], [0, 1], 'k--') | |
plt.xlabel('False Positive Rate -> (1-Specificity)') | |
plt.ylabel('True Positive Rate -> (Recall)') | |
plt.legend(loc='lower right') | |
plt.show() | |
"""**Task 10: (Bonus 10 points) How to improve the classification accuracy?** | |
from sklearn.preprocessing import StandardScaler | |
standardscaler=StandardScaler() | |
x_train_scale= standardscaler.fit_transform(x_train) | |
x_test_scale= standardscaler.fit_transform(x_test) | |
-->By using this we can improve accuracy. | |
## **Part II (20 points): Deploy the machine learning models on Gradio or huggingface** | |
""" | |
##!pip install --quiet gradio | |
from gradio.outputs import Label | |
import gradio as gr | |
##import tensorflow as tf | |
def caption(input_module, input_module1):
    """Classify one review comment with the model chosen in the UI.

    Args:
        input_module: raw review text entered by the user.
        input_module1: display name of the selected model.

    Returns:
        A tuple ``(label, probabilities)`` where ``label`` is
        "Positive comment" or "Negative comment" and ``probabilities``
        maps each class name to its predicted probability.
    """
    class_a = ["Negative Comment", "Positive Comment"]
    # Vectorize the single sentence with the vocabulary fitted on training data.
    input_mod = [input_module]
    features = vectorizer.transform(input_mod).toarray()
    if input_module1 == "Logistic Regression":
        model = lr
        # FIX: lr was trained on standardized features, so scale here too
        # (the original predicted on raw counts for every model).
        features = standardscaler.transform(features)
    elif input_module1 == "Linear discriminant analysis":
        model = clf
    elif input_module1 == "Quadratic discriminant analysis":
        model = qda
    elif input_module1 == "Naive Bayes classifier":
        model = nbm
    else:
        # FIX: any remaining choice falls through to the SVM. The original
        # elif chain had no else, so the dropdown's misspelled
        # "Support Vecotr Machine" value raised NameError on output1.
        model = svm
    output1 = model.predict(features)
    predictions = model.predict_proba(features)[0]
    print(predictions.shape)
    output2 = {}
    if output1 == 0:
        output1 = "Negative comment"
    else:
        output1 = "Positive comment"
    # Pair each class name with its probability for the Label widget.
    for i in range(len(predictions)):
        output2[class_a[i]] = predictions[i]
    return output1, output2
# Gradio UI: free-text review + model selector -> predicted class + probabilities.
input_module = gr.inputs.Textbox(label="Review comment")
# FIX: "Support Vecotr Machine" was misspelled, so that choice never matched
# the "Support Vector Machine" branch inside caption().
input_module1 = gr.inputs.Dropdown(
    choices=["Logistic Regression", "Linear discriminant analysis",
             "Quadratic discriminant analysis", "Naive Bayes classifier",
             "Support Vector Machine"],
    label="Method")
output1 = gr.outputs.Textbox(label="Predicted Class")
output2 = gr.outputs.Label(label="probability of class")
gr.Interface(fn=caption, inputs=[input_module, input_module1], outputs=[output1, output2]).launch(debug=True)