In [54]:
#import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,cross_val_score, ShuffleSplit 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
#read in file
df = pd.read_csv('sms_spam.csv')
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
# group by type of text/sms
df.groupby('type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,642,Please call our customer service representativ...,4


In [57]:
#creating a new column named spam that classifies texts into spam or no spam messages/sms
# using the lambda function
df['spam'] = df['type'].apply(lambda x:1 if x == 'spam' else 0)
df.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [58]:
#using the train test split to split our datasets in the ratio 75:25 or 3:1
x_train,x_test,y_train,y_test = train_test_split(df.text,df.spam,test_size=0.25)

In [59]:
# Taking care of our text data by calling the count_vectorizer on them to change into a numerical data
# that the model will understand.
count = CountVectorizer()
x_train_count = count.fit_transform(x_train.values)
x_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [60]:
# Making use of the MultiNomial Naive Bayes model
model = LogisticRegression()
model.fit(x_train_count,y_train)

LogisticRegression()

In [61]:
# Testing out our model's accuracy
x_test_pred = count.transform(x_test)
accuracy_score(model.predict(x_test_pred),y_test)

0.9849354375896701

In [62]:
# Classification report
print(f"classification report : {classification_report(model.predict(x_test_pred),y_test)}")

classification report :               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1212
           1       0.90      0.99      0.95       182

    accuracy                           0.98      1394
   macro avg       0.95      0.99      0.97      1394
weighted avg       0.99      0.98      0.99      1394



In [63]:
# Using the pipeline
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',LogisticRegression())
])


In [64]:
# fit our model
clf.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('nb', LogisticRegression())])

In [65]:
# Score our model
clf.score(x_test,y_test)

0.9849354375896701

In [66]:
cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state=0)
cross_val_score(MultinomialNB(),x_train_count,y_train, cv=cv)

array([0.97607656, 0.9784689 , 0.97727273, 0.98684211, 0.98325359])

In [67]:
# Saving our model as a pickle file
import pickle
with open("model_log.pkl", "wb") as f:
    pickle.dump(model, f)

with open("model_log.pkl", "rb") as f:
    model = pickle.load(f)
    

# Saving our vectorizer
with open("vectorizer.pkl", "wb") as vect:
    pickle.dump(count, vect)

In [68]:
s  = ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
    , "Nah I don't think he goes to usf, he lives around here though","Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
     "URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"]
test = count.transform(s).toarray()
model.predict(test)

array([1, 0, 1, 1])