Upload 2 files
Browse files- bag-of-words.py +58 -0
- spam_classifier_pipeline.joblib +3 -0
bag-of-words.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import string
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
from sklearn.pipeline import Pipeline
|
5 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
6 |
+
from sklearn.naive_bayes import MultinomialNB
|
7 |
+
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
|
8 |
+
from joblib import dump
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from nltk.stem import PorterStemmer
|
12 |
+
|
13 |
+
# Download necessary NLTK resources
# (no-op after the first run; NLTK caches the corpus locally).
nltk.download('stopwords')

# Initialize stopwords and stemmer, shared by preprocess() below.
# A set gives O(1) membership tests during per-word filtering.
stop_words = set(stopwords.words('english'))
# Porter stemmer reduces inflected forms to a common stem
# (e.g. "running" -> "run") so the vectorizer merges them into one feature.
stemmer = PorterStemmer()
|
19 |
+
|
20 |
+
def preprocess(text, _punct_table=str.maketrans('', '', string.punctuation)):
    """Clean and preprocess text for model input.

    Lowercases the text, strips ASCII punctuation, removes English
    stopwords, and Porter-stems the remaining words.

    Parameters
    ----------
    text : str
        Raw message text.
    _punct_table : dict, private
        Deletion table built once at definition time; do not pass.

    Returns
    -------
    str
        Space-joined, stemmed, stopword-free tokens.
    """
    # str.translate deletes punctuation in a single C-level pass instead of
    # rebuilding the string character by character; the table is hoisted
    # into a default argument so it is computed once, not per call.
    text = text.lower().translate(_punct_table)
    # Tokenize on whitespace, drop stopwords, stem the survivors.
    return ' '.join(stemmer.stem(word) for word in text.split()
                    if word not in stop_words)
|
28 |
+
|
29 |
+
# --- Data loading -------------------------------------------------------
# The SMS spam dataset ships latin-1 encoded with columns v1 (label) and
# v2 (message text); keep only those two and give them readable names.
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Map the ham/spam strings onto the label names expected downstream.
df['label'] = df['label'].map({'ham': 'LABEL_0', 'spam': 'LABEL_1'})

# --- Train/test split ---------------------------------------------------
# Hold out 20% for evaluation; a fixed seed keeps the split reproducible.
train_msgs, test_msgs, train_labels, test_labels = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# --- Model --------------------------------------------------------------
# Bundle text cleaning, bag-of-words counting, and Naive Bayes into one
# pipeline so a single object can be serialized and reused for inference.
spam_model = Pipeline([
    ('vectorizer', CountVectorizer(preprocessor=preprocess)),
    ('classifier', MultinomialNB()),
])
spam_model.fit(train_msgs, train_labels)

# --- Evaluation ---------------------------------------------------------
predictions = spam_model.predict(test_msgs)
print('Accuracy:', accuracy_score(test_labels, predictions))
print('Confusion Matrix:\n', confusion_matrix(test_labels, predictions))
print('Classification Report:\n', classification_report(test_labels, predictions))

# --- Persistence --------------------------------------------------------
# Serialize the whole fitted pipeline (preprocessing included).
dump(spam_model, 'spam_classifier_pipeline.joblib')
|
57 |
+
|
58 |
+
|
spam_classifier_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a1a49cb5a900390a985b97c3c28c66fa5b4d20afcc9ee03bb9c361610146f62
|
3 |
+
size 314730
|