# -*- coding: utf-8 -*-
"""Homework05.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UY5nOy6oxpblrAJFEKZOgbw0jIBl7vUn

# **Part I: Apply Classification methods on Text Classification Dataset**

**Develop a Machine Learning workflow for text classification using machine learning models. The following questions should be completed in the Jupyter Notebook.**

**Task 1: (10 points) We have Homework05 progress discussion (Homework05_discussion) due on Wednesday (Oct 26) to report what progress you/your group have achieved. Everyone needs to submit a report (at least 100 words), including a progress description for Task 2-4 and plans for the remaining questions.**

**Task 2: (5 points) Prepare the dataset from Lab06-A**

**Requirement: You must follow steps in (Lab06-PartA: Bag-of-Words for Text Processing and Feature Extraction) to generate the word count tables using Bag-of-Words techniques for the combination of IMDb, Amazon, and Yelp datasets.**
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# The notebook cell used the bare IPython automagic `cd <path>`, which is a
# syntax error in a plain .py file.  os.chdir() performs the same working
# directory change so the relative file names below keep resolving.
os.chdir(
    "/content/drive/MyDrive/Colab Notebooks/"
    "sentiment labelled sentences/sentiment labelled sentences"
)

# Each file is tab separated with one "<sentence>\t<label>" record per line
# and no header row, so column names are supplied explicitly.
# NOTE(review): pandas' default quote handling may merge lines that contain
# unbalanced double quotes -- confirm the printed shapes match the expected
# number of records per file.
yelp_df = pd.read_csv('yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
amazon_df = pd.read_csv('amazon_cells_labelled.txt', names=['sentence', 'label'], sep='\t')
imdb_df = pd.read_csv('imdb_labelled.txt', names=['sentence', 'label'], sep='\t')

print("Yelp shape : ", yelp_df.shape)
print("Amazon shape : ", amazon_df.shape)
print("imdb shape : ", imdb_df.shape)

# Stack the three sources into a single frame with a fresh 0..n-1 index.
con_label = [yelp_df, amazon_df, imdb_df]
input_df = pd.concat(con_label, ignore_index=True)
print("input shape : ", input_df.shape)

# Histogram of the numeric column(s) of the combined frame (the label).
input_df.hist()
# Flush the figure so later plots do not draw onto the same axes when this
# file is run as a script instead of cell by cell.
plt.show()

"""**Task 3: (5 points) Dividing the full dataset into separate training and test dataset**"""

# 80/20 split; random_state pins the shuffle so results are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_df['sentence'],
    input_df['label'],
    test_size=0.2,
    random_state=42,
)
import numpy as np
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def _plot_cv_boxplot(cv_scores, title, xlabel):
    """Draw a box plot of per-fold cross-validation accuracies."""
    plt.boxplot(cv_scores)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Accuracy")
    # Close out the figure so the ROC curve that follows gets its own axes.
    plt.show()


def _print_test_metrics(y_true, y_predicted):
    """Print accuracy, precision, recall and F1 for test-set predictions.

    scikit-learn's metric functions take ``(y_true, y_pred)`` in that order;
    passing them the other way round silently swaps precision and recall.
    """
    y_true = np.asarray(y_true).astype(int)
    y_predicted = np.asarray(y_predicted).astype(int)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_predicted))
    print("Precision:", precision_score(y_true, y_predicted))
    print("recall_score:", recall_score(y_true, y_predicted))
    print("f1_score:", f1_score(y_true, y_predicted))


def _plot_roc_curve(y_true, positive_scores, curve_label):
    """Plot a ROC curve and return ``(fpr, tpr, thresholds)``.

    ``positive_scores`` must be the score of the positive class (label 1),
    i.e. column 1 of ``predict_proba``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, positive_scores)
    plt.plot(fpr, tpr, linewidth=2, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate -> (1-Specificity)')
    plt.ylabel('True Positive Rate -> (Recall)')
    plt.legend(loc='lower right')
    plt.show()
    return fpr, tpr, thresholds


# Boolean mask selecting the negative (label 0) training samples.
y0 = y_train == 0
# len(y0) would only repeat the training-set size; the number of negative
# samples is the count of True entries in the mask.
print(int(y0.sum()))

"""**Task 4: (5 points) Report the frequency of classes (positive, negative classes) in train, and test set. Are they balanced?**"""

plt.subplot(1, 2, 1)
y_train.hist()
plt.subplot(1, 2, 2)
y_test.hist()
plt.show()

x_train = x_train.to_list()

# min_df=1 keeps every term that occurs in at least one document, which is
# exactly what the notebook's integer min_df=0 selected; newer scikit-learn
# releases reject an integer 0 during parameter validation.
vectorizer = CountVectorizer(min_df=1, lowercase=False, stop_words='english')
vectorizer.fit(x_train)
print("Vocabulary: ", vectorizer.vocabulary_)
print("Vocabulary words: ", vectorizer.vocabulary_.keys())
print("Vocabulary index: ", vectorizer.vocabulary_.values())

# Dense word-count matrices; the vocabulary is learned from the training
# sentences only and then reused for the test sentences.
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
print("Training matrix shape", x_train.shape)
print("Testing matrix shape", x_test.shape)

# Fit the scaler on the training data only and reuse its statistics for the
# test data.
standardscaler = StandardScaler()
x_train_scale = standardscaler.fit_transform(x_train)
x_test_scale = standardscaler.transform(x_test)

"""## **Logistic regression**"""

lr = LogisticRegression(random_state=0).fit(x_train_scale, y_train)
cv_scores_lr = cross_val_score(estimator=lr, X=x_train_scale, y=y_train, cv=10, scoring='accuracy')
y_pred = lr.predict(x_test_scale)

print("Accuracy of test dataset: ", accuracy_score(y_test, y_pred))
print("Precision of test dataset: ", precision_score(y_test, y_pred))
print("Recall of test dataset: ", recall_score(y_test, y_pred))
print("F1-Score of test dataset: ", f1_score(y_test, y_pred))

# Score the test set with the fitted model.  The model was trained on scaled
# features, so it must be scored on the scaled test matrix as well.
y_scores_lr = lr.predict_proba(x_test_scale)
y_scores_lr_new = y_scores_lr[:, 1]
fpr, tpr, thresholds = _plot_roc_curve(y_test, y_scores_lr_new, 'Logistic Regression')

"""## **Task 9.1: Linear discriminant analysis:**"""

clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)

CV_scores_clf = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_clf)
_plot_cv_boxplot(CV_scores_clf, "10-fold cross validation accuracy", "linear discriminative analysis")

y_test_pred = clf.predict(x_test)
_print_test_metrics(y_test, y_test_pred)

y_scores_clf = clf.predict_proba(x_test)
y_scores_clf_new = y_scores_clf[:, 1]
fpr, tpr, thresholds = _plot_roc_curve(y_test, y_scores_clf_new, 'Linear discriminative analysis')

"""## **Task 9.2: Quadratic discriminant analysis**"""

qda = QuadraticDiscriminantAnalysis()
qda.fit(x_train, y_train)

CV_scores_qda = cross_val_score(estimator=qda, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_qda)
_plot_cv_boxplot(CV_scores_qda, "10-fold cross validation accuracy", "quadratic discriminant analysis")

y_test_pred1 = qda.predict(x_test)
_print_test_metrics(y_test, y_test_pred1)

y_scores_qda = qda.predict_proba(x_test)
# Column 1 is the positive class; using column 0 would mirror the ROC curve
# below the diagonal.
y_scores_qda_new = y_scores_qda[:, 1]
fpr, tpr, thresholds = _plot_roc_curve(y_test, y_scores_qda_new, 'quadratic discriminative analysis')

"""## **Task 9.3: Naive bayes model (optimal choice for text classification)**"""

nbm = MultinomialNB()
nbm.fit(x_train, y_train)

CV_scores_nbm = cross_val_score(estimator=nbm, X=x_train, y=y_train, cv=10, scoring='accuracy')
print("CV_scores: ", CV_scores_nbm)
_plot_cv_boxplot(CV_scores_nbm, "10-fold cross validation accuracy", "naive bayes analysis")

y_test_pred2 = nbm.predict(x_test)
_print_test_metrics(y_test, y_test_pred2)

y_scores_nbm = nbm.predict_proba(x_test)
y_scores_nbm_new = y_scores_nbm[:, 1]
fpr, tpr, thresholds = _plot_roc_curve(y_test, y_scores_nbm_new, 'naive bayes analysis')

"""## **Task 9.4: Support Vector Machine**"""

# probability=True is required for predict_proba (used by the ROC curve and
# by the Gradio app below).
svm = SVC(probability=True)
svm.fit(x_train, y_train)

# Only two folds are used here, so the plot title says so.
CV_scores_svm = cross_val_score(estimator=svm, X=x_train, y=y_train, cv=2, scoring='accuracy')
print("CV_scores: ", CV_scores_svm)
_plot_cv_boxplot(CV_scores_svm, "2-fold cross validation accuracy", "Support Vector Machine")

y_test_pred3 = svm.predict(x_test)
_print_test_metrics(y_test, y_test_pred3)

# Use the SVM's own probabilities (not the Naive Bayes scores) for its curve.
y_scores_svm = svm.predict_proba(x_test)
y_scores_svm_new = y_scores_svm[:, 1]
fpr, tpr, thresholds = _plot_roc_curve(y_test, y_scores_svm_new, 'Support vector machine')

"""**Task 10: (Bonus 10 points) How to improve the classification accuracy?**

from sklearn.preprocessing import StandardScaler

standardscaler=StandardScaler()

x_train_scale= standardscaler.fit_transform(x_train)

x_test_scale= standardscaler.transform(x_test)

-->By using this we can improve accuracy.

## **Part II (20 points): Deploy the machine learning models on Gradio or huggingface**
"""

##!pip install --quiet gradio

from gradio.outputs import Label
import gradio as gr

##import tensorflow as tf

# Dropdown label -> fitted classifier.  The dropdown choices are generated
# from these keys so a label can never drift out of sync with the dispatch.
_CLASSIFIERS = {
    "Logistic Regression": lr,
    "Linear discriminant analysis": clf,
    "Quadratic discriminant analysis": qda,
    "Naive Bayes classifier": nbm,
    "Support Vector Machine": svm,
}


def caption(input_module, input_module1):
    """Classify one review sentence with the selected model.

    Args:
        input_module: The review text entered by the user.
        input_module1: The method name chosen in the dropdown; must be one
            of the keys of ``_CLASSIFIERS``.

    Returns:
        A tuple ``(predicted_class, probabilities)`` where ``predicted_class``
        is ``"Negative comment"`` or ``"Positive comment"`` and
        ``probabilities`` maps each class label to its predicted probability.

    Raises:
        ValueError: If ``input_module1`` is not a known method name.
    """
    class_a = ["Negative Comment", "Positive Comment"]

    if input_module1 not in _CLASSIFIERS:
        raise ValueError(f"Unknown classification method: {input_module1!r}")
    model = _CLASSIFIERS[input_module1]

    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([input_module]).toarray()
    if model is lr:
        # Logistic regression was fitted on standardized features, so the
        # same scaling has to be applied at prediction time.
        features = standardscaler.transform(features)

    output1 = model.predict(features)
    predictions = model.predict_proba(features)[0]
    print(predictions.shape)

    if output1[0] == 0:
        output1 = "Negative comment"
    else:
        output1 = "Positive comment"

    # Plain floats keep the mapping serializable for the Label component.
    output2 = {
        label: float(probability)
        for label, probability in zip(class_a, predictions)
    }
    return output1, output2


input_module = gr.inputs.Textbox(label="Review comment")
input_module1 = gr.inputs.Dropdown(choices=list(_CLASSIFIERS), label="Method")
output1 = gr.outputs.Textbox(label="Predicted Class")
output2 = gr.outputs.Label(label="probability of class")

gr.Interface(fn=caption, inputs=[input_module, input_module1], outputs=[output1, output2]).launch(debug=True)