{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Importing Essential libraries : \n", "import pandas as pd\n", "# import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", "
" ], "text/plain": [ " Category Message\n", "0 ham Go until jurong point, crazy.. Available only ...\n", "1 ham Ok lar... Joking wif u oni...\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", "3 ham U dun say so early hor... U c already then say...\n", "4 ham Nah I don't think he goes to usf, he lives aro..." ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# importing data to work on :\n", "dataset = pd.read_csv(\"spam.csv\")\n", "dataset.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5572, 2)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Category', 'Message'], dtype='object')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.columns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5572 entries, 0 to 5571\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Category 5572 non-null object\n", " 1 Message 5572 non-null object\n", "dtypes: object(2)\n", "memory usage: 87.2+ KB\n" ] }, { "data": { "text/plain": [ "Category 0\n", "Message 0\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check for NULL values. DataFrame.info() prints its own report and returns None,\n", "# so it is called directly instead of being wrapped in print() :\n", "dataset.info()\n", "dataset.isna().sum()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Category 2\n", "Message 5157\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Confirm that Category holds only two distinct labels (spam and ham) :\n", "dataset.nunique()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", "
" ], "text/plain": [ " Category Message Spam\n", "0 ham Go until jurong point, crazy.. Available only ... 0\n", "1 ham Ok lar... Joking wif u oni... 0\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", "3 ham U dun say so early hor... U c already then say... 0\n", "4 ham Nah I don't think he goes to usf, he lives aro... 0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encoding Categories into 0 and 1 (vectorized: spam -> 1, anything else -> 0) :\n", "dataset[\"Spam\"] = (dataset[\"Category\"] == \"spam\").astype(\"int64\")\n", "dataset.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "X = dataset[\"Message\"]\n", "y = dataset[\"Spam\"]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Train-Test Split :" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((4457,), (1115,), (4457,), (1115,))" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Importing CountVectorizer, which converts the text into a matrix of token counts :\n", "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Importing Different classifiers to compare :\n", "# from sklearn.linear_model import LogisticRegression\n", "# from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB # ✔️✔️ Works well with this type of problems, when data is discrete."
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Pipeline: raw text -> CountVectorizer -> MultinomialNB :\n", "\n", "from sklearn.pipeline import Pipeline\n", "clf = Pipeline(steps=[\n", "    (\"vectorizer\", CountVectorizer()),\n", "    (\"nb\", MultinomialNB()),\n", "])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])
" ], "text/plain": [ "Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the pipeline on the training split :\n", "\n", "clf.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.97847533632287" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Accuracy on the held-out test split :\n", "clf.score(X_test, y_test)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### *TESTING :*" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spam!\n", "Good to go 👍\n", "Good to go 👍\n", "Spam!\n", "Spam!\n", "Good to go 👍\n" ] } ], "source": [ "msg = [\"Thanks for your subscription to Ringtone - 'Shila ki jawaani', your mobile will be charged RS.5/month Please confirm by replying YES or NO. If you reply NO you will not be charged\",\n", "\"Oops, I'll let you know when my roommate's done\",\n", "\"hello, i am akshat, are you free today?\",\n", "\"free free free, get free coins, just download this xyz app (100 RS. Instant Cash)\",\n", "\"subscribe to get unlimited benefits\",\n", "\" i want some money, can you plz send me? \"]\n", "\n", "# True Values : 1 0 0 1 1 0\n", "# i.e. - Spam, Ham, Ham, Spam, Spam, Ham\n", "\n",
"# Label 0 is ham, any other label is reported as spam :\n", "y_pred = clf.predict(msg)\n", "for label in y_pred:\n", "    print(\"Good to go 👍\" if label == 0 else \"Spam!\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "#### *Saving this as a model using Joblib :*" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# from joblib import dump\n", "# dump(clf, 'Classifier.joblib')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6 (tags/v3.10.6:9c7b4bd, Aug 1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "706654849fe4d07e215a38f448ee8e5d780794e2be3793e11d37ab3169b306ae" } } }, "nbformat": 4, "nbformat_minor": 2 }