import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import pickle

# Load the dataset
txt_file_path = 'SMSSpamCollection.txt'

# Initialize empty lists to store labels and messages
labels = []
messages = []

# Read the text file line by line and extract tab-separated labels and messages
try:
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                label, message = parts
                labels.append(label)
                messages.append(message)

    # Create a DataFrame from the lists
    dataset = pd.DataFrame({'label': labels, 'message': messages})

    # Print the first few rows to confirm the data loaded successfully
    print(dataset.head())
except Exception as e:
    print(f"Error reading text file: {e}")

# The dataset has 'label' and 'message' columns; map the labels to integers
X = dataset['message'].values
y = dataset['label'].map({'spam': 1, 'ham': 0}).values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data; words outside the top max_words map to the '<OOV>' token
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) each sequence to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')

# Build the DNN model: embedding -> flatten -> dense -> sigmoid output
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set (threshold the sigmoid outputs at 0.5)
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

# Print classification report, confusion matrix, and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save the model
model.save('spam_dnn_model.h5')

# Save the tokenizer, trimming word_index to the indices the Embedding layer
# (input_dim=max_words) can actually accept, so the pickle stays small
with open('tokenizer_dnn.pkl', 'wb') as tokenizer_file:
    tokenizer.word_index = {word: i for word, i in tokenizer.word_index.items() if i < max_words}
    pickle.dump(tokenizer, tokenizer_file)
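
# --- Inference sketch (not part of the original script) ---
# A minimal sanity check of the saved artifacts: reload spam_dnn_model.h5 and
# tokenizer_dnn.pkl and score a couple of sample messages. The sample texts are
# made-up illustrations; the preprocessing (tokenize, then pad to the same
# max_sequence_length used in training) must match the training pipeline.
from tensorflow.keras.models import load_model

loaded_model = load_model('spam_dnn_model.h5')
with open('tokenizer_dnn.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

samples = [
    "Congratulations! You have won a free prize. Call now!",  # likely spam
    "Are we still meeting for lunch tomorrow?",               # likely ham
]

# Reuse the exact preprocessing from training
sample_sequences = loaded_tokenizer.texts_to_sequences(samples)
sample_padded = pad_sequences(sample_sequences, maxlen=max_sequence_length, padding='post')

# Sigmoid outputs are spam probabilities; threshold at 0.5 as in evaluation above
probs = loaded_model.predict(sample_padded).ravel()
for text, p in zip(samples, probs):
    print(f"{p:.3f}  {'spam' if p > 0.5 else 'ham'}  |  {text}")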