Spaces: Runtime error
Hosting spam detector app with Hugging Face
- main.py +86 -0
- model_log.pkl +0 -0
- requirements.txt +4 -0
- sms_spam.csv +0 -0
- spam_detector.ipynb +528 -0
- vectorizer.pkl +0 -0
main.py
ADDED
@@ -0,0 +1,86 @@
import pickle
import streamlit as st


# loading in the saved vectorizer and model to predict on the data
with open('vectorizer.pkl', 'rb') as vectorizer_in:
    vectorizer = pickle.load(vectorizer_in)
with open('model_log.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

# Header image
st.image("https://media.istockphoto.com/photos/phishing-scam-email-identity-alert-3d-rendering-picture-id1046171248")


def welcome():
    return 'welcome all'


# defining the function which makes the prediction using
# the text which the user inputs
def prediction(text):
    vector_text = vectorizer.transform([text]).toarray()
    result = classifier.predict(vector_text)
    print(result)
    return result


# this is the main function that defines the webpage
def main():
    # giving the webpage a title
    st.title("Spam E-mail Detector")

    # the font and background colour, the padding and the text to be displayed
    html_temp = """
    <div style="background-color:black;padding:13px">
    <h1 style="color:white;text-align:center;">Spam Detector App</h1>
    </div>
    """

    # this line displays the front-end elements defined above
    st.markdown(html_temp, unsafe_allow_html=True)

    # list of available models
    options = st.radio("Available Models:", ["Logistic Regression", "Multinomial Naive Bayes", "Decision Tree"])

    # when the 'Predict' button is clicked, the prediction function defined
    # above is called and its output is stored in the variable result
    if options == "Logistic Regression":
        st.success("You picked {}".format(options))
        # the following line creates a text box in which the user can enter
        # the message to classify
        text = st.text_input("Message:", "Type your message here")

        if st.button('Predict'):
            result = prediction(text)
            if result[0] == 0:
                st.success('This is not a spam mail/sms.')
            else:
                st.error('This is a spam mail/sms.')
    else:
        st.warning('This model is under development and not available for predicting yet.')

    html_git = """
    <h3>Checkout my GitHub</h3>
    <div style="background-color:black;padding:13px">
    <h1 style="color:white;text-align:center;"><a href="https://github.com/Taoheed-O">My GitHub link</a></h1>
    </div>
    """
    html_linkedIn = """
    <h3>Connect with me on LinkedIn</h3>
    <div style="background-color:black;padding:13px">
    <h1 style="color:white;text-align:center;"><a href="https://www.linkedin.com/in/taoheed-oyeniyi">My LinkedIn</a></h1>
    </div>
    """

    # display the GitHub and LinkedIn links
    st.markdown(html_git, unsafe_allow_html=True)
    st.markdown(html_linkedIn, unsafe_allow_html=True)


if __name__ == '__main__':
    main()
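As a quick sanity check before deploying, the two pickled artifacts can be exercised outside Streamlit. The sketch below is not part of this commit; it assumes vectorizer.pkl and model_log.pkl sit in the working directory, as they do after this change.

# smoke_test.py - minimal sketch (not part of this commit) for checking the
# pickled artifacts locally before pushing to the Space.
import pickle

with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
with open("model_log.pkl", "rb") as f:
    classifier = pickle.load(f)

sample = "URGENT! You have won a free prize, text CLAIM to 81010"
features = vectorizer.transform([sample])        # bag-of-words features
label = int(classifier.predict(features)[0])     # 1 = spam, 0 = ham
print("spam" if label == 1 else "ham")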
model_log.pkl
ADDED
Binary file (61 kB).
requirements.txt
ADDED
@@ -0,0 +1,4 @@
numpy
pandas
scikit-learn
streamlit
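Since the Space's "Runtime error" status is often an environment problem, it can help to confirm that the four dependencies above actually import in the Space's Python. The snippet below is only an optional check, not a file in this commit.

# Optional sanity check (not part of this commit): confirm the runtime
# dependencies resolve and print their installed versions.
import numpy
import pandas
import sklearn
import streamlit

for module in (numpy, pandas, sklearn, streamlit):
    print(f"{module.__name__}=={module.__version__}")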
sms_spam.csv
ADDED
The diff for this file is too large to render.
spam_detector.ipynb
ADDED
@@ -0,0 +1,528 @@
In [54]:
# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
# read in the file
df = pd.read_csv('sms_spam.csv')
df.head()

Out[55]:
   type                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

In [56]:
# group by type of text/sms
df.groupby('type').describe()

Out[56]:
      text
     count unique                                                top freq
type
ham   4827   4518                             Sorry, I'll call later   30
spam   747    642  Please call our customer service representativ...    4

In [57]:
# creating a new column named spam that labels texts as spam or non-spam messages/sms
# using a lambda function
df['spam'] = df['type'].apply(lambda x: 1 if x == 'spam' else 0)
df.head()

Out[57]:
   type                                               text  spam
0   ham  Go until jurong point, crazy.. Available only ...     0
1   ham                      Ok lar... Joking wif u oni...     0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3   ham  U dun say so early hor... U c already then say...     0
4   ham  Nah I don't think he goes to usf, he lives aro...     0

In [58]:
# using train_test_split to split the dataset in the ratio 75:25 (3:1)
x_train, x_test, y_train, y_test = train_test_split(df.text, df.spam, test_size=0.25)

In [59]:
# converting the text data with the CountVectorizer into numerical features
# that the model will understand
count = CountVectorizer()
x_train_count = count.fit_transform(x_train.values)
x_train_count.toarray()[:3]

Out[59]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [60]:
# Making use of the Logistic Regression model
model = LogisticRegression()
model.fit(x_train_count, y_train)

Out[60]:
LogisticRegression()

In [61]:
# Testing out our model's accuracy
x_test_pred = count.transform(x_test)
accuracy_score(model.predict(x_test_pred), y_test)

Out[61]:
0.9849354375896701

In [62]:
# Classification report
print(f"classification report : {classification_report(model.predict(x_test_pred), y_test)}")

classification report :               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1212
           1       0.90      0.99      0.95       182

    accuracy                           0.98      1394
   macro avg       0.95      0.99      0.97      1394
weighted avg       0.99      0.98      0.99      1394

In [63]:
# Using the pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', LogisticRegression())
])

In [64]:
# fit our model
clf.fit(x_train, y_train)

Out[64]:
Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('nb', LogisticRegression())])

In [65]:
# Score our model
clf.score(x_test, y_test)

Out[65]:
0.9849354375896701

In [66]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(MultinomialNB(), x_train_count, y_train, cv=cv)

Out[66]:
array([0.97607656, 0.9784689 , 0.97727273, 0.98684211, 0.98325359])

In [67]:
# Saving our model as a pickle file
import pickle
with open("model_log.pkl", "wb") as f:
    pickle.dump(model, f)

with open("model_log.pkl", "rb") as f:
    model = pickle.load(f)

# Saving our vectorizer
with open("vectorizer.pkl", "wb") as vect:
    pickle.dump(count, vect)

In [68]:
s = ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
     "Nah I don't think he goes to usf, he lives around here though",
     "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
     "URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"]
test = count.transform(s).toarray()
model.predict(test)

Out[68]:
array([1, 0, 1, 1])
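The notebook already fits a Pipeline (clf) that couples the CountVectorizer and the LogisticRegression. One possible follow-up, sketched below as a hypothetical extra notebook cell rather than something this commit does, is to pickle that pipeline as a single artifact so the Streamlit app only has to load one file; spam_pipeline.pkl is an assumed file name.

# Hypothetical extra cell (not in this commit): persist the fitted pipeline
# from In [64] as one artifact instead of separate vectorizer/model pickles.
import pickle

with open("spam_pipeline.pkl", "wb") as f:   # hypothetical file name
    pickle.dump(clf, f)

with open("spam_pipeline.pkl", "rb") as f:
    pipeline = pickle.load(f)

# The pipeline accepts raw strings, so no separate transform step is needed.
print(pipeline.predict(["Free entry in 2 a wkly comp to win FA Cup final tkts"]))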
vectorizer.pkl
ADDED
Binary file (91 kB).