Upload 2 files
Browse files- bag-of-words.py +58 -0
- spam_classifier_pipeline.joblib +3 -0
bag-of-words.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import string
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
from sklearn.pipeline import Pipeline
|
5 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
6 |
+
from sklearn.naive_bayes import MultinomialNB
|
7 |
+
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
|
8 |
+
from joblib import dump
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from nltk.stem import PorterStemmer
|
12 |
+
|
13 |
+
# Download necessary NLTK resources
# (no-op after the first run; NLTK caches the corpus locally).
nltk.download('stopwords')

# Initialize stopwords and stemmer, shared by preprocess() below.
# A set gives O(1) membership tests during per-word filtering.
stop_words = set(stopwords.words('english'))
# Porter stemmer reduces inflected forms to a common stem
# (e.g. "running" -> "run") so the vectorizer merges them into one feature.
stemmer = PorterStemmer()
|
19 |
+
|
20 |
+
def preprocess(text, _punct_table=str.maketrans('', '', string.punctuation)):
    """Clean and preprocess text for model input.

    Lowercases the text, strips ASCII punctuation, removes English
    stopwords, and Porter-stems the remaining words.

    Parameters
    ----------
    text : str
        Raw message text.
    _punct_table : dict, private
        Deletion table built once at definition time; do not pass.

    Returns
    -------
    str
        Space-joined, stemmed, stopword-free tokens.
    """
    # str.translate deletes punctuation in a single C-level pass instead of
    # rebuilding the string character by character; the table is hoisted
    # into a default argument so it is computed once, not per call.
    text = text.lower().translate(_punct_table)
    # Tokenize on whitespace, drop stopwords, stem the survivors.
    return ' '.join(stemmer.stem(word) for word in text.split()
                    if word not in stop_words)
|
28 |
+
|
29 |
+
# --- Data loading -------------------------------------------------------
# The SMS spam dataset ships latin-1 encoded with columns v1 (label) and
# v2 (message text); keep only those two and give them readable names.
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Map the ham/spam strings onto the label names expected downstream.
df['label'] = df['label'].map({'ham': 'LABEL_0', 'spam': 'LABEL_1'})

# --- Train/test split ---------------------------------------------------
# Hold out 20% for evaluation; a fixed seed keeps the split reproducible.
train_msgs, test_msgs, train_labels, test_labels = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# --- Model --------------------------------------------------------------
# Bundle text cleaning, bag-of-words counting, and Naive Bayes into one
# pipeline so a single object can be serialized and reused for inference.
spam_model = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=preprocess)),
    ('classifier', MultinomialNB()),
])
spam_model.fit(train_msgs, train_labels)

# --- Evaluation ---------------------------------------------------------
predictions = spam_model.predict(test_msgs)
print('Accuracy:', accuracy_score(test_labels, predictions))
print('Confusion Matrix:\n', confusion_matrix(test_labels, predictions))
print('Classification Report:\n', classification_report(test_labels, predictions))

# --- Persistence --------------------------------------------------------
# Serialize the whole fitted pipeline (preprocessing included).
dump(spam_model, 'spam_classifier_pipeline.joblib')
|
57 |
+
|
58 |
+
|
spam_classifier_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a1a49cb5a900390a985b97c3c28c66fa5b4d20afcc9ee03bb9c361610146f62
|
3 |
+
size 314730
|