import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import pickle
# Load the dataset
txt_file_path = 'SMSSpamCollection.txt'
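# Each line of the SMS Spam Collection file is expected to hold a label ('ham' or 'spam')
# and the message text, separated by a single tab character.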
# Initialize empty lists to store labels and messages
labels = []
messages = []
# Read the text file line by line and extract labels and messages
try:
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                label, message = parts
                labels.append(label)
                messages.append(message)
    # Create a DataFrame from the lists
    dataset = pd.DataFrame({'label': labels, 'message': messages})
    # Print the first few rows of the DataFrame to check that the data loaded successfully
    print(dataset.head())
except Exception as e:
    print(f"Error reading text file: {e}")
    # Re-raise so the rest of the script does not run against an undefined dataset
    raise
# The dataset has 'label' and 'message' columns
X = dataset['message'].values
y = dataset['label'].map({'spam': 1, 'ham': 0}).values
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text data
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
# Pad sequences to a fixed length
max_sequence_length = 200
X_train_padded = pad_sequences(sequences_train, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, padding='post')
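# For example, a short message like "free entry now" is first mapped to the word indices
# of its tokens (illustrative values, e.g. [52, 341, 87]; actual indices depend on the
# fitted tokenizer) and then right-padded with zeros to length 200.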
# Build the DNN model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_split=0.2)
# Generate class predictions on the test set (threshold the sigmoid output at 0.5)
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32").ravel()
# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
# Save the model
model.save('spam_dnn_model.h5')
# Save the tokenizer
with open('tokenizer_dnn.pkl', 'wb') as tokenizer_file:
    # Drop words the model never sees (indices at or above max_words) to keep the pickle small
    tokenizer.word_index = {word: index for word, index in tokenizer.word_index.items() if index < max_words}
    pickle.dump(tokenizer, tokenizer_file)
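# Quick inference sketch: reload the saved artifacts and classify a new message.
# The sample text below is illustrative only; thresholding at 0.5 mirrors the evaluation above.
from tensorflow.keras.models import load_model
loaded_model = load_model('spam_dnn_model.h5')
with open('tokenizer_dnn.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)
sample = ["Congratulations! You have won a free prize, call now to claim."]
sample_padded = pad_sequences(loaded_tokenizer.texts_to_sequences(sample),
                              maxlen=max_sequence_length, padding='post')
prediction = loaded_model.predict(sample_padded)[0, 0]
print("spam" if prediction > 0.5 else "ham")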