import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BackPropogation import BackPropogation
from sklearn.preprocessing import LabelEncoder
import pickle
# Load the SMS Spam Collection dataset (tab-separated: label \t message)
sms_dataset_path = 'SMSSpamCollection.txt'
sms_data = []
sms_labels = []
with open(sms_dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            label, message = parts
            sms_labels.append(label)
            sms_data.append(message)
# Encode the 'ham'/'spam' labels as numerical values (0/1)
label_encoder = LabelEncoder()
sms_labels = label_encoder.fit_transform(sms_labels)

# The custom BackPropogation class is assumed not to require input_size at initialization
backpropagation = BackPropogation(learning_rate=0.01, epochs=5)
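
# For reference, a minimal sketch of the interface this script assumes the
# custom class exposes (hypothetical; the real implementation lives in
# BackPropogation.py and may differ):
#
#   class BackPropogation:
#       def __init__(self, learning_rate=0.01, epochs=5): ...
#       def fit(self, X, y): ...       # X: 2-D array, y: 0/1 labels
#       def predict(self, X): ...      # returns one 0/1 label per row of X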
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sms_data, sms_labels, test_size=0.2, random_state=42)

# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')
# Flatten the padded sequences into 2-D feature arrays for the custom model
# (the test set is flattened the same way so train and test inputs match)
X_train_flatten = X_train_padded.reshape((X_train_padded.shape[0], -1))
X_test_flatten = X_test_padded.reshape((X_test_padded.shape[0], -1))
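
# Note: the model is fed raw integer token IDs. If the implementation is
# sensitive to input scale, a simple normalization sketch would be (an
# assumption, not part of the original pipeline; uncomment to try it):
# X_train_flatten = X_train_flatten / float(max_words)
# X_test_flatten = X_test_flatten / float(max_words)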
# Train the Backpropagation model
backpropagation.fit(X_train_flatten, y_train)
# Make predictions on the test set
# (the test data was already tokenized, padded, and flattened above)
predictions = backpropagation.predict(X_test_flatten)
# Evaluate and print results
print("Backpropagation Classification Report:")
print(classification_report(y_test, predictions))
print("Backpropagation Accuracy:", accuracy_score(y_test, predictions))
# Save the trained Backpropagation model using pickle
backpropagation_model_path = 'spam_backpropagation_model.pkl'
with open(backpropagation_model_path, 'wb') as model_file:
    pickle.dump(backpropagation, model_file)

# Save the tokenizer using pickle
tokenizer_path = 'tokenizer_backpropagation.pkl'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
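
# A minimal usage sketch: reload the pickled artifacts and classify a new
# message. This assumes, as above, that predict returns one 0/1 label per
# row; the sample text is purely illustrative.
with open(backpropagation_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(tokenizer_path, 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

sample = ["Congratulations! You have won a free prize, call now"]
sample_seq = loaded_tokenizer.texts_to_sequences(sample)
sample_padded = pad_sequences(sample_seq, maxlen=max_sequence_length, padding='post')
sample_pred = loaded_model.predict(sample_padded)
# Map the numeric prediction back to the original 'ham'/'spam' label
pred_label = label_encoder.inverse_transform(np.ravel(sample_pred).astype(int))
print("Sample message classified as:", pred_label[0])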