{ "cells": [ { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "#import necessary libraries\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split,KFold,cross_val_score, ShuffleSplit \n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import f1_score,accuracy_score,classification_report\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typetext
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", "
" ], "text/plain": [ " type text\n", "0 ham Go until jurong point, crazy.. Available only ...\n", "1 ham Ok lar... Joking wif u oni...\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", "3 ham U dun say so early hor... U c already then say...\n", "4 ham Nah I don't think he goes to usf, he lives aro..." ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the labelled SMS messages (path is relative to the working directory)\n", "# and preview the first five rows.\n", "df = pd.read_csv(filepath_or_buffer='sms_spam.csv')\n", "df.head(n=5)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
countuniquetopfreq
type
ham48274518Sorry, I'll call later30
spam747642Please call our customer service representativ...4
\n", "
" ], "text/plain": [ " text \n", " count unique top freq\n", "type \n", "ham 4827 4518 Sorry, I'll call later 30\n", "spam 747 642 Please call our customer service representativ... 4" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-class summary of the messages: count, unique texts, most frequent text\n", "messages_by_type = df.groupby('type')\n", "messages_by_type.describe()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typetextspam
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", "
" ], "text/plain": [ " type text spam\n", "0 ham Go until jurong point, crazy.. Available only ... 0\n", "1 ham Ok lar... Joking wif u oni... 0\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", "3 ham U dun say so early hor... U c already then say... 0\n", "4 ham Nah I don't think he goes to usf, he lives aro... 0" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a numeric target column: 1 for spam, 0 for ham.\n", "# A vectorised comparison replaces the row-by-row lambda and yields the same values.\n", "df['spam'] = (df['type'] == 'spam').astype('int64')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "# Split the data 75:25 into a training and a test part.\n", "# random_state pins the shuffle so every score below is reproducible on re-run;\n", "# stratify keeps the spam/ham ratio equal in both parts (spam is the minority class).\n", "x_train,x_test,y_train,y_test = train_test_split(\n", "    df.text,df.spam,test_size=0.25,random_state=0,stratify=df.spam)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0]])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn the raw text into token-count vectors the model can consume.\n", "# The vocabulary is learned from the training messages only.\n", "count = CountVectorizer()\n", "x_train_count = count.fit_transform(x_train.values)\n", "# Densify only the three previewed rows instead of the whole sparse matrix.\n", "x_train_count[:3].toarray()" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression()" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit a logistic regression classifier on the token counts\n", "model = LogisticRegression()\n", "model.fit(x_train_count,y_train)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9849354375896701" ] }, "execution_count": 61, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "# Accuracy on the held-out test set.\n", "# The test text is vectorised with the vocabulary fitted on the training set.\n", "x_test_count = count.transform(x_test)\n", "accuracy_score(y_test,model.predict(x_test_count))" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "classification report : precision recall f1-score support\n", "\n", " 0 1.00 0.98 0.99 1212\n", " 1 0.90 0.99 0.95 182\n", "\n", " accuracy 0.98 1394\n", " macro avg 0.95 0.99 0.97 1394\n", "weighted avg 0.99 0.98 0.99 1394\n", "\n" ] } ], "source": [ "# Classification report.\n", "# scikit-learn expects the arguments as (y_true, y_pred); passing them the other\n", "# way round swaps precision and recall and reports support of the predictions.\n", "print(f\"classification report : {classification_report(y_test,model.predict(x_test_count))}\")" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "# Bundle vectoriser and classifier into one pipeline so raw text can be passed directly.\n", "# NOTE: the final step is named 'nb' although it wraps LogisticRegression; the name is\n", "# kept so that lookups such as clf.named_steps['nb'] continue to work.\n", "clf = Pipeline([\n", "    ('vectorizer',CountVectorizer()),\n", "    ('nb',LogisticRegression())\n", "])\n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('vectorizer', CountVectorizer()),\n", " ('nb', LogisticRegression())])" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the pipeline on the raw training messages\n", "clf.fit(x_train,y_train)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9849354375896701" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean accuracy of the pipeline on the raw test messages\n", "clf.score(x_test,y_test)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.97607656, 0.9784689 , 0.97727273, 0.98684211, 0.98325359])" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cross-validate the same pipeline that is fitted above, on the raw training text.\n", "# Running the pipeline inside each fold re-fits the vectoriser on that fold's training\n", "# part only, so no vocabulary from the validation part leaks into the features.\n", "cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state=0)\n", "cross_val_score(clf,x_train,y_train, cv=cv)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": 
[], "source": [ "# Persist the fitted classifier and the fitted vectoriser as pickle files.\n", "# Both are needed at inference time: new text must be vectorised with the same\n", "# vocabulary the classifier was trained on.\n", "import pickle\n", "with open(\"model_log.pkl\", \"wb\") as model_file:\n", "    pickle.dump(model, model_file)\n", "\n", "# Read the classifier back to confirm the file round-trips.\n", "# CAUTION: pickle.load can execute arbitrary code; only load files you created yourself.\n", "with open(\"model_log.pkl\", \"rb\") as model_file:\n", "    model = pickle.load(model_file)\n", "\n", "# Saving our vectorizer\n", "with open(\"vectorizer.pkl\", \"wb\") as vectorizer_file:\n", "    pickle.dump(count, vectorizer_file)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 1, 1])" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Smoke-test the reloaded classifier on a few hand-picked messages (1 = spam, 0 = ham)\n", "sample_messages = [\n", "    \"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\",\n", "    \"Nah I don't think he goes to usf, he lives around here though\",\n", "    \"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\",\n", "    \"URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18\",\n", "]\n", "# The classifier accepts the sparse count matrix directly, so no dense copy is made.\n", "sample_counts = count.transform(sample_messages)\n", "model.predict(sample_counts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }