analysis / app.py
Varunjulakanti's picture
initial commit
5eef608
# -*- coding: utf-8 -*-
"""Homework05.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1UY5nOy6oxpblrAJFEKZOgbw0jIBl7vUn
# **Part I: Apply Classification methods on Text Classification Dataset**
**Develop a Machine Learning workflow for text classification using machine learning models. The following questions should be completed in the Jupyter Notebook.**
**Task 1: (10 points) We have a Homework05 progress discussion (Homework05_discussion) due on Wednesday (Oct 26) to report what progress you/your group have achieved. Everyone needs to submit a report (at least 100 words), including a progress description for Tasks 2-4 and plans for the remaining questions.**
**Task 2: (5 points) Prepare the dataset from Lab06-A**
**Requirement: You must follow steps in (Lab06-PartA: Bag-of-Words for Text Processing and Feature Extraction) to generate the word count tables using Bag-of-Words techniques for the combination of IMDb, Amazon, and Yelp datasets.**
"""
# The notebook used the IPython shell magic `cd <path>` here, which is a
# syntax error in a plain Python script. Change directory explicitly instead,
# and only when the Colab Drive folder exists, so the script can also be run
# from a working directory that already contains the three data files.
import os

COLAB_DATA_DIR = (
    "/content/drive/MyDrive/Colab Notebooks/"
    "sentiment labelled sentences/sentiment labelled sentences"
)
if os.path.isdir(COLAB_DATA_DIR):
    os.chdir(COLAB_DATA_DIR)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
# Load the three tab-separated review files (one "sentence<TAB>label" row per
# line, no header row) and stack them into a single frame with a fresh index.
columns = ['sentence', 'label']
yelp_df = pd.read_csv('yelp_labelled.txt', sep='\t', names=columns)
amazon_df = pd.read_csv('amazon_cells_labelled.txt', sep='\t', names=columns)
imdb_df = pd.read_csv('imdb_labelled.txt', sep='\t', names=columns)

con_label = [yelp_df, amazon_df, imdb_df]
for source_name, frame in zip(("Yelp", "Amazon", "imdb"), con_label):
    print(source_name + " shape : ", frame.shape)

input_df = pd.concat(con_label, ignore_index=True)
print("input shape : ", input_df.shape)

# Histogram of the numeric column(s) of the combined frame.
input_df.hist()
"""**Task 3: (5 points) Dividing the full dataset into separate training and test dataset**"""
# Hold out 20% of the sentences as the test set; the fixed random_state makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_df['sentence'], input_df['label'], test_size=0.2, random_state=42
)

# Boolean mask of the training rows whose label is 0. Its length is the size
# of the whole training set; the number of True entries is the class-0 count,
# so both are reported with explicit labels.
y0 = y_train == 0
print("Training samples: ", len(y0))
print("Training samples with label 0: ", int(y0.sum()))
"""**Task 4: (5 points) Report the frequency of classes (positive, negative classes) in train, and test set. Are they balanced?**"""
# Report the class frequencies numerically, as the task asks.
print("Train class counts:\n", y_train.value_counts())
print("Test class counts:\n", y_test.value_counts())

# Draw the two label histograms side by side on a figure of their own, so
# they are not added to a figure left open by an earlier plot, and show the
# figure so that later plots start from a clean canvas.
plt.figure()
plt.subplot(1, 2, 1)
y_train.hist()
plt.title("Train labels")
plt.subplot(1, 2, 2)
y_test.hist()
plt.title("Test labels")
plt.show()
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training
# sentences only and then applied to both the training and the test split.
x_train = x_train.to_list()

# min_df must be an integer >= 1 (or a float in [0, 1]); an integer 0 is
# rejected by parameter validation in recent scikit-learn releases.
# min_df=1 keeps every term that occurs in at least one document, which is
# every term in the vocabulary, so the resulting features are unchanged.
vectorizer = CountVectorizer(min_df=1, lowercase=False, stop_words='english')
vectorizer.fit(x_train)
print("Vocabulary: ",vectorizer.vocabulary_)
print("Vocabulary words: ",vectorizer.vocabulary_.keys())
print("Vocabulary index: ",vectorizer.vocabulary_.values())

# Dense arrays are used from here on: x_train / x_test are rebound from text
# to word-count matrices.
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
print("Training matrix shape", x_train.shape)
print("Testing matrix shape", x_test.shape)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training word counts only, then apply that same
# fitted transformation to both the training and the test matrices.
standardscaler = StandardScaler()
standardscaler.fit(x_train)
x_train_scale = standardscaler.transform(x_train)
x_test_scale = standardscaler.transform(x_test)
"""## **Logistic regression**"""
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import roc_curve

# Logistic regression is trained and evaluated on the standardized features.
lr = LogisticRegression(random_state=0).fit(x_train_scale, y_train)

# 10-fold cross-validated accuracy on the training split; printed like the
# other models' scores instead of being computed and discarded.
cv_scores_lr = cross_val_score(estimator = lr, X = x_train_scale, y = y_train, cv = 10, scoring = 'accuracy')
print("CV_scores: ", cv_scores_lr)

# Held-out test metrics (sklearn metric signature is (y_true, y_pred)).
y_pred = lr.predict(x_test_scale)
print("Accuracy of test dataset: ", accuracy_score(y_test,y_pred ))
print("Precision of test dataset: ", precision_score(y_test, y_pred))
print("Recall of test dataset: ", recall_score(y_test, y_pred))
print("F1-Score of test dataset: ", f1_score(y_test, y_pred))

# Out-of-fold class probabilities for the ROC curve. The scaled test matrix
# is used so this step sees the same feature representation as the fit and
# the metrics above (the original passed the unscaled x_test here).
y_scores_lr = cross_val_predict(lr, x_test_scale, y_test, cv=10, method="predict_proba")
y_scores_lr_new=y_scores_lr[:,1]  # column 1 = probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores_lr_new)

# New figure so the ROC curve is not drawn over an earlier, still-open plot.
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label='Logistic Regression')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()
"""## **Task 9.1: Linear discriminant analysis:**"""
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import roc_curve

# Linear discriminant analysis on the raw (unscaled) word counts.
clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)

# 10-fold cross-validated accuracy on the training split.
CV_scores_clf = cross_val_score(estimator = clf, X = x_train, y = y_train, cv = 10, scoring = 'accuracy')
print("CV_scores: ", CV_scores_clf)

# The boxplot gets its own figure and is shown before the ROC curve is drawn,
# otherwise both end up on the same axes when run as a script.
plt.figure()
plt.boxplot(CV_scores_clf)
plt.title("10-fold cross validation accuracy")
plt.xlabel("linear discriminative analysis")
plt.ylabel("Accuracy")
plt.show()

# Held-out test metrics. sklearn metrics take (y_true, y_pred); with the
# arguments reversed, precision and recall are reported under each other's
# name, so the true labels are passed first.
y_test_pred = clf.predict(x_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print("Precision:",precision_score(y_test.astype(int), y_test_pred.astype(int)))
print("recall_score:",recall_score(y_test.astype(int), y_test_pred.astype(int)))
print("f1_score:",f1_score(y_test.astype(int), y_test_pred.astype(int)))

# Out-of-fold class probabilities on the test split for the ROC curve.
y_scores_clf = cross_val_predict(clf, x_test, y_test, cv=10, method="predict_proba")
y_scores_clf_new=y_scores_clf[:,1]  # column 1 = probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores_clf_new)
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label='Linear discriminative analysis')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()
"""## **Task 9.2: Quadratic discriminant analysis**"""
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

# Quadratic discriminant analysis on the raw (unscaled) word counts.
qda=QuadraticDiscriminantAnalysis()
qda.fit(x_train,y_train)

# 10-fold cross-validated accuracy on the training split.
CV_scores_qda = cross_val_score(estimator = qda, X = x_train, y = y_train, cv = 10, scoring = 'accuracy')
print("CV_scores: ", CV_scores_qda)

# Separate figure for the boxplot so the ROC curve is not drawn on top of it.
plt.figure()
plt.boxplot(CV_scores_qda)
plt.title("10-fold cross validation accuracy")
plt.xlabel("quadratic discriminant analysis")
plt.ylabel("Accuracy")
plt.show()

# Held-out test metrics. sklearn metrics take (y_true, y_pred); the true
# labels are passed first so precision and recall are not swapped.
y_test_pred1 = qda.predict(x_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred1))
print("Precision:",precision_score(y_test.astype(int), y_test_pred1.astype(int)))
print("recall_score:",recall_score(y_test.astype(int), y_test_pred1.astype(int)))
print("f1_score:",f1_score(y_test.astype(int), y_test_pred1.astype(int)))

# Out-of-fold class probabilities on the test split for the ROC curve.
# roc_curve expects the score of the positive class (label 1), i.e. column 1
# of predict_proba; column 0 would produce the mirrored curve.
y_scores_qda = cross_val_predict(qda, x_test, y_test, cv=10, method="predict_proba")
y_scores_qda_new=y_scores_qda[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores_qda_new)
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label='quadratic discriminative analysis')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()
"""## **Task 9.3: Naive bayes model (optimal choice for text classification)**"""
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Multinomial naive Bayes on the raw (non-negative) word counts.
nbm=MultinomialNB()
nbm.fit(x_train,y_train)

# 10-fold cross-validated accuracy on the training split.
CV_scores_nbm = cross_val_score(estimator = nbm, X = x_train, y = y_train, cv = 10, scoring = 'accuracy')
print("CV_scores: ", CV_scores_nbm)

# Separate figure for the boxplot so the ROC curve is not drawn on top of it.
plt.figure()
plt.boxplot(CV_scores_nbm)
plt.title("10-fold cross validation accuracy")
plt.xlabel("naive bayes analysis")
plt.ylabel("Accuracy")
plt.show()

# Held-out test metrics. sklearn metrics take (y_true, y_pred); the true
# labels are passed first so precision and recall are not swapped.
y_test_pred2= nbm.predict(x_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred2))
print("Precision:",precision_score(y_test.astype(int), y_test_pred2.astype(int)))
print("recall_score:",recall_score(y_test.astype(int), y_test_pred2.astype(int)))
print("f1_score:",f1_score(y_test.astype(int), y_test_pred2.astype(int)))

# Out-of-fold class probabilities on the test split for the ROC curve.
y_scores_nbm = cross_val_predict(nbm,x_test, y_test, cv=10, method="predict_proba")
y_scores_nbm_new=y_scores_nbm[:,1]  # column 1 = probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores_nbm_new)
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label='naive bayes analysis')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()
"""## **Task 9.4: Support Vector Machine**"""
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Support vector classifier on the raw word counts. probability=True is
# required so that predict_proba is available for the ROC curve and the demo.
svm=SVC(probability=True)
svm.fit(x_train,y_train)

# Only 2 folds are used for this model, so the plot title says 2-fold
# (the original title claimed 10-fold while cv was 2).
CV_scores_svm = cross_val_score(estimator = svm, X = x_train, y = y_train, cv = 2, scoring = 'accuracy')
print("CV_scores: ", CV_scores_svm)
plt.figure()
plt.boxplot(CV_scores_svm)
plt.title("2-fold cross validation accuracy")
plt.xlabel("Support Vector Machine")
plt.ylabel("Accuracy")
plt.show()

# Held-out test metrics. sklearn metrics take (y_true, y_pred); the true
# labels are passed first so precision and recall are not swapped.
y_test_pred3= svm.predict(x_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred3))
print("Precision:",precision_score(y_test.astype(int), y_test_pred3.astype(int)))
print("recall_score:",recall_score(y_test.astype(int), y_test_pred3.astype(int)))
print("f1_score:",f1_score(y_test.astype(int), y_test_pred3.astype(int)))

# Out-of-fold class probabilities for the ROC curve, computed with the SVM
# itself. The original passed the naive Bayes estimator and reused its
# scores, so the "Support vector machine" curve was really the NB curve.
y_scores_svm = cross_val_predict(svm,x_test, y_test, cv=10, method="predict_proba")
y_scores_svm_new=y_scores_svm[:,1]  # column 1 = probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores_svm_new)
plt.figure()
plt.plot(fpr, tpr, linewidth=2, label='Support vector machine')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate -> (1-Specificity)')
plt.ylabel('True Positive Rate -> (Recall)')
plt.legend(loc='lower right')
plt.show()
"""**Task 10: (Bonus 10 points) How to improve the classification accuracy?**
from sklearn.preprocessing import StandardScaler
standardscaler=StandardScaler()
x_train_scale= standardscaler.fit_transform(x_train)
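x_test_scale= standardscaler.transform(x_test)
--> Standardizing the features this way can improve accuracy. The scaler is fitted on the training set only and then applied unchanged to the test set, so no test-set statistics leak into the model.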
## **Part II (20 points): Deploy the machine learning models on Gradio or huggingface**
"""
##!pip install --quiet gradio
from gradio.outputs import Label
import gradio as gr
##import tensorflow as tf
def caption(input_module,input_module1):
    """Classify one review comment with the selected model.

    Args:
        input_module: The review text to classify (a single string).
        input_module1: Name of the classifier to use. One of
            "Logistic Regression", "Linear discriminant analysis",
            "Quadratic discriminant analysis", "Naive Bayes classifier" or
            "Support Vector Machine".

    Returns:
        A tuple ``(output1, output2)`` where ``output1`` is the predicted
        class as text ("Negative comment" / "Positive comment") and
        ``output2`` maps each class label to its predicted probability.

    Raises:
        ValueError: If ``input_module1`` is not one of the supported names.
    """
    class_a = ["Negative Comment", "Positive Comment"]

    # Same bag-of-words encoding as used for training; the vectorizer expects
    # an iterable of documents, hence the one-element list.
    features = vectorizer.transform([input_module]).toarray()

    models = {
        "Logistic Regression": lr,
        "Linear discriminant analysis": clf,
        "Quadratic discriminant analysis": qda,
        "Naive Bayes classifier": nbm,
        "Support Vector Machine": svm,
    }
    if input_module1 not in models:
        # Previously an unmatched name fell through every branch and failed
        # later with an UnboundLocalError.
        raise ValueError("Unknown classification method: {!r}".format(input_module1))
    model = models[input_module1]

    if model is lr:
        # Logistic regression was fitted on standardized features, so the
        # input must go through the same fitted scaler; the other models
        # were fitted on the raw counts.
        features = standardscaler.transform(features)

    predicted_label = model.predict(features)[0]
    predictions = model.predict_proba(features)[0]

    # Label 0 is the negative class, anything else is reported as positive.
    if predicted_label == 0:
        output1 = "Negative comment"
    else:
        output1 = "Positive comment"

    # Probabilities are paired with the class names in column order and
    # converted to plain Python floats.
    output2 = {name: float(prob) for name, prob in zip(class_a, predictions)}
    return output1,output2
# Gradio front end: a text box for the review and a dropdown for the model.
# The dropdown strings must match the names checked inside caption(); the
# SVM entry was misspelled ("Support Vecotr Machine"), so selecting it never
# reached the SVM branch.
# NOTE(review): gr.inputs / gr.outputs are used as in the original — confirm
# these namespaces exist in the installed Gradio version.
input_module= gr.inputs.Textbox(label = "Review comment")
input_module1= gr.inputs.Dropdown(choices=["Logistic Regression","Linear discriminant analysis", "Quadratic discriminant analysis","Naive Bayes classifier","Support Vector Machine"], label = "Method")
output1 = gr.outputs.Textbox(label = "Predicted Class")
output2=gr.outputs.Label(label= "probability of class")
gr.Interface(fn=caption, inputs=[input_module,input_module1], outputs=[output1,output2]).launch(debug=True)