import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import pickle
# Load the dataset
txt_file_path = 'SMSSpamCollection.txt'
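# Each line of the SMS Spam Collection file is expected to hold a label ('ham' or 'spam')
# and the message text, separated by a single tab character.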
# Initialize empty lists to store labels and messages
labels = []
messages = []
# Read the text file line by line and extract labels and messages
try:
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                label, message = parts
                labels.append(label)
                messages.append(message)
    # Create a DataFrame from the lists
    dataset = pd.DataFrame({'label': labels, 'message': messages})
    # Print the first few rows of the DataFrame to check that the data loaded successfully
    print(dataset.head())
except Exception as e:
    print(f"Error reading text file: {e}")
    # Re-raise so the rest of the script does not run against an undefined dataset
    raise
# The dataset has 'label' and 'message' columns
X = dataset['message'].values
y = dataset['label'].map({'spam': 1, 'ham': 0}).values
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
# Pad sequences to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')
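# For example, a short message like "free entry now" is first mapped to the word indices
# of its tokens (illustrative values, e.g. [52, 341, 87]; actual indices depend on the
# fitted tokenizer) and then right-padded with zeros to length 200.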
# Build the DNN model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)
# Generate class predictions on the test set (threshold the sigmoid output at 0.5)
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32").ravel()
# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
# Save the model
model.save('spam_dnn_model.h5')
# Save the tokenizer
with open('tokenizer_dnn.pkl', 'wb') as tokenizer_file:
    # Drop words the model never sees (indices at or above max_words) to keep the pickle small
    tokenizer.word_index = {word: index for word, index in tokenizer.word_index.items() if index < max_words}
    pickle.dump(tokenizer, tokenizer_file)
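# Quick inference sketch: reload the saved artifacts and classify a new message.
# The sample text below is illustrative only; thresholding at 0.5 mirrors the evaluation above.
from tensorflow.keras.models import load_model
loaded_model = load_model('spam_dnn_model.h5')
with open('tokenizer_dnn.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)
sample = ["Congratulations! You have won a free prize, call now to claim."]
sample_padded = pad_sequences(loaded_tokenizer.texts_to_sequences(sample),
                              maxlen=max_sequence_length, padding='post')
prediction = loaded_model.predict(sample_padded)[0, 0]
print("spam" if prediction > 0.5 else "ham")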