import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BackPropogation import BackPropogation
from sklearn.preprocessing import LabelEncoder
import pickle
# Load the SMS Spam Collection dataset
sms_dataset_path = 'SMSSpamCollection.txt'
sms_data = []
sms_labels = []
with open(sms_dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            label, message = parts
            sms_labels.append(label)
            sms_data.append(message)
# Use LabelEncoder to encode 'spam' and 'ham' into numerical values
label_encoder = LabelEncoder()
sms_labels = label_encoder.fit_transform(sms_labels)
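# Note: LabelEncoder assigns integers in alphabetical order of the class names,
# so 'ham' -> 0 and 'spam' -> 1; the mapping is stored in label_encoder.classes_.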
# Initialize the custom backpropagation model; the BackPropogation class is
# assumed not to require input_size at initialization
backpropagation = BackPropogation(learning_rate=0.01, epochs=5)
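# Minimal stand-in sketch of the custom BackPropogation class, shown only for
# reference in case BackPropogation.py is unavailable. This is an assumption:
# the real class may differ; this sketch just matches the fit(X, y) /
# predict(X) interface used in this script and is not instantiated here.
class BackPropogationSketch:
    def __init__(self, learning_rate=0.01, epochs=5):
        self.learning_rate = learning_rate
        self.epochs = epochs
    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        self.weights = np.zeros(X.shape[1])
        self.bias = 0.0
        for _ in range(self.epochs):
            for xi, target in zip(X, y):
                # Forward pass through a single sigmoid unit
                pred = 1.0 / (1.0 + np.exp(-(np.dot(xi, self.weights) + self.bias)))
                # Gradient of the squared error w.r.t. the pre-activation
                grad = (target - pred) * pred * (1.0 - pred)
                self.weights += self.learning_rate * grad * xi
                self.bias += self.learning_rate * grad
    def predict(self, X):
        X = np.asarray(X, dtype=float)
        scores = 1.0 / (1.0 + np.exp(-(X @ self.weights + self.bias)))
        return (scores >= 0.5).astype(int)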
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sms_data, sms_labels, test_size=0.2, random_state=42)
# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
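# num_words=10000 restricts the vocabulary to the most frequent tokens; words
# outside it are mapped to the '<OOV>' token rather than being dropped.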
# Pad sequences to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')
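# padding='post' appends zeros after each sequence so every row has length 200;
# by default, sequences longer than maxlen are truncated from the front.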
# Reshape the padded training sequences to (n_samples, n_features) for the model
X_train_flatten = X_train_padded.reshape((X_train_padded.shape[0], -1))
# Train the Backpropagation model
backpropagation.fit(X_train_flatten, y_train)
# Flatten the test sequences the same way as the training data
X_test_flatten = X_test_padded.reshape((X_test_padded.shape[0], -1))
# Make predictions on the test set
predictions = backpropagation.predict(X_test_flatten)
# Evaluate and print results
print("Perceptron Classification Report:")
print(classification_report(y_test, predictions))
print("Perceptron Accuracy:", accuracy_score(y_test, predictions))
# Save the trained Backpropagation model using pickle
backpropagation_model_path = 'spam_backpropagation_model.pkl'
with open(backpropagation_model_path, 'wb') as model_file:
    pickle.dump(backpropagation, model_file)
# Save the tokenizer using pickle
tokenizer_path = 'tokenizer_backpropagation.pkl'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)