import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BackPropogation import BackPropogation
from sklearn.preprocessing import LabelEncoder
import pickle
# Load the SMS Spam Collection dataset (tab-separated: label \t message)
sms_dataset_path = 'SMSSpamCollection.txt'
sms_data = []
sms_labels = []
with open(sms_dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            label, message = parts
            sms_labels.append(label)
            sms_data.append(message)
# Encode the 'ham'/'spam' labels as numerical values (0/1)
label_encoder = LabelEncoder()
sms_labels = label_encoder.fit_transform(sms_labels)

# The custom BackPropogation class is assumed not to require input_size at initialization
backpropagation = BackPropogation(learning_rate=0.01, epochs=5)
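
# For reference, a minimal sketch of the interface this script assumes the
# custom class exposes (hypothetical; the real implementation lives in
# BackPropogation.py and may differ):
#
#   class BackPropogation:
#       def __init__(self, learning_rate=0.01, epochs=5): ...
#       def fit(self, X, y): ...       # X: 2-D array, y: 0/1 labels
#       def predict(self, X): ...      # returns one 0/1 label per row of X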
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sms_data, sms_labels, test_size=0.2, random_state=42)

# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')
# Flatten the padded sequences into 2-D feature arrays for the custom model
# (the test set is flattened the same way so train and test inputs match)
X_train_flatten = X_train_padded.reshape((X_train_padded.shape[0], -1))
X_test_flatten = X_test_padded.reshape((X_test_padded.shape[0], -1))
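
# Note: the model is fed raw integer token IDs. If the implementation is
# sensitive to input scale, a simple normalization sketch would be (an
# assumption, not part of the original pipeline; uncomment to try it):
# X_train_flatten = X_train_flatten / float(max_words)
# X_test_flatten = X_test_flatten / float(max_words)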
# Train the Backpropagation model
backpropagation.fit(X_train_flatten, y_train)
# Make predictions on the test set
# (the test data was already tokenized, padded, and flattened above)
predictions = backpropagation.predict(X_test_flatten)
# Evaluate and print results
print("Backpropagation Classification Report:")
print(classification_report(y_test, predictions))
print("Backpropagation Accuracy:", accuracy_score(y_test, predictions))
# Save the trained Backpropagation model using pickle
backpropagation_model_path = 'spam_backpropagation_model.pkl'
with open(backpropagation_model_path, 'wb') as model_file:
    pickle.dump(backpropagation, model_file)

# Save the tokenizer using pickle
tokenizer_path = 'tokenizer_backpropagation.pkl'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
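
# A minimal usage sketch: reload the pickled artifacts and classify a new
# message. This assumes, as above, that predict returns one 0/1 label per
# row; the sample text is purely illustrative.
with open(backpropagation_model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(tokenizer_path, 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

sample = ["Congratulations! You have won a free prize, call now"]
sample_seq = loaded_tokenizer.texts_to_sequences(sample)
sample_padded = pad_sequences(sample_seq, maxlen=max_sequence_length, padding='post')
sample_pred = loaded_model.predict(sample_padded)
# Map the numeric prediction back to the original 'ham'/'spam' label
pred_label = label_encoder.inverse_transform(np.ravel(sample_pred).astype(int))
print("Sample message classified as:", pred_label[0])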