{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Importing Libraries : \n", "import pandas as pd\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CategoryMessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", "
" ], "text/plain": [ " Category Message\n", "0 ham Go until jurong point, crazy.. Available only ...\n", "1 ham Ok lar... Joking wif u oni...\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", "3 ham U dun say so early hor... U c already then say...\n", "4 ham Nah I don't think he goes to usf, he lives aro..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the labelled messages (columns: Category, Message).\n", "dataset = pd.read_csv(\"data/spam.csv\")\n", "dataset.head()" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5572, 2)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Category', 'Message'], dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.columns" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5572 entries, 0 to 5571\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Category 5572 non-null object\n", " 1 Message 5572 non-null object\n", "dtypes: object(2)\n", "memory usage: 87.2+ KB\n" ] }, { "data": { "text/plain": [ "Category 0\n", "Message 0\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check for NULL values in the dataset.\n", "# DataFrame.info() prints its report itself and returns None, so it is called\n", "# bare; wrapping it in print() would emit a stray \"None\" line.\n", "dataset.info()\n", "dataset.isna().sum()" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Category 2\n", "Message 5157\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check that Category holds no values other than spam and ham:\n", "dataset.nunique()" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CategoryMessageSpam
0hamGo until jurong point, crazy.. Available only ...0
1hamOk lar... Joking wif u oni...0
2spamFree entry in 2 a wkly comp to win FA Cup fina...1
3hamU dun say so early hor... U c already then say...0
4hamNah I don't think he goes to usf, he lives aro...0
\n", "
" ], "text/plain": [ " Category Message Spam\n", "0 ham Go until jurong point, crazy.. Available only ... 0\n", "1 ham Ok lar... Joking wif u oni... 0\n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1\n", "3 ham U dun say so early hor... U c already then say... 0\n", "4 ham Nah I don't think he goes to usf, he lives aro... 0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Encode Category as an integer label: 1 for \"spam\", 0 for anything else.\n", "# Vectorized comparison instead of a Python-level loop over the column.\n", "dataset[\"Spam\"] = dataset[\"Category\"].eq(\"spam\").astype(\"int64\")\n", "dataset.head()" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Features are the raw message text; the target is the 0/1 Spam label.\n", "X = dataset[\"Message\"]\n", "y = dataset[\"Spam\"]" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Train-Test Split :" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((4457,), (1115,), (4457,), (1115,))" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Importing CountVectorizer, which converts the text into a matrix of token counts:\n", "from sklearn.feature_extraction.text import CountVectorizer" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Importing different classifiers to compare:\n", "# from sklearn.linear_model import LogisticRegression\n", "# from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.naive_bayes import MultinomialNB # ✔️✔️ Works well with this type of problem, i.e. when data is discrete."
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Pipeline: raw text -> token counts (CountVectorizer) -> MultinomialNB classifier.\n", "from sklearn.pipeline import Pipeline\n", "\n", "clf = Pipeline(\n", "    steps=[\n", "        (\"vectorizer\", CountVectorizer()),\n", "        (\"nb\", MultinomialNB()),\n", "    ]\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the whole pipeline (vectorizer + classifier) on the training split.\n", "clf.fit(X_train, y_train)" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.97847533632287" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Accuracy check on the held-out test split:\n", "clf.score(X_test, y_test)" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### *TESTING :*" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spam!\n", "Good to go 👍\n", "Good to go 👍\n", "Spam!\n", "Spam!\n", "Good to go 👍\n" ] } ], "source": [ "# Hand-written messages for a quick manual check of the fitted pipeline.\n", "msg = [\n", "    \"Thanks for your subscription to Ringtone - 'Shila ki jawaani', your mobile will be charged RS.5/month Please confirm by replying YES or NO. If you reply NO you will not be charged\",\n", "    \"Oops, I'll let you know when my roommate's done\",\n", "    \"hello, i am akshat, are you free today?\",\n", "    \"free free free, get free coins, just download this xyz app (100 RS. Instant Cash)\",\n", "    \"subscribe to get unlimited benefits\",\n", "    \" i want some money, can you plz send me? \",\n", "]\n", "\n", "# True Values : 1 0 0 1 1 0\n", "# i.e. - Spam, Ham, Ham, Spam, Spam, Ham\n", "\n", "y_pred = clf.predict(msg)\n", "for label in y_pred:\n", "    # Label 0 is ham; any other label is reported as spam.\n", "    if label == 0:\n", "        print(\"Good to go 👍\")\n", "    else:\n", "        print(\"Spam!\")" ] },
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "#### *Saving the model using `Pickle` :*" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Saving is disabled: uncomment to pickle the fitted pipeline.\n", "# The models/ directory must already exist, otherwise open() raises FileNotFoundError.\n", "# with open(\"models/spam-clf.pkl\", \"wb\") as f:\n", "#     pickle.dump(clf, f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "706654849fe4d07e215a38f448ee8e5d780794e2be3793e11d37ab3169b306ae" } } }, "nbformat": 4, "nbformat_minor": 2 }