# Imports: TensorFlow/Keras for the model, pandas/numpy for data handling,
# and Gradio for the demo UI.
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gradio as gr

# Load the SMS spam dataset; the CSV is expected to have 'Category'
# (ham/spam) and 'Message' columns.
dataset = pd.read_csv('./SPAMtextmessage.csv')
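
# Optional sanity check: peek at the first rows and the class balance
# before preprocessing.
print(dataset.head())
print(dataset['Category'].value_counts())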

# Convert the string labels to integers: 'ham' -> 0, 'spam' -> 1.
dataset['Category'] = dataset['Category'].str.replace('ham', '0')
dataset['Category'] = dataset['Category'].str.replace('spam', '1')
dataset['Category'] = dataset['Category'].astype(int)
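
# Equivalent, slightly more direct alternative (shown for illustration only):
# dataset['Category'] = dataset['Category'].map({'ham': 0, 'spam': 1})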

sentences = dataset['Message'].tolist()
labels = dataset['Category'].tolist()

# Hold out the last 20% of the data for validation.
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Keras expects NumPy arrays rather than Python lists for labels.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# Tokenization hyperparameters.
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"  # placeholder token for out-of-vocabulary words

# Fit the tokenizer on the training sentences only, then convert both splits
# to padded integer sequences of uniform length.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)
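
# Optional shape check: both splits should now be (num_examples, 100).
print(padded.shape, testing_padded.shape)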

# A small feed-forward classifier: embed each token, flatten the sequence,
# and pass it through two hidden layers to a single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', metrics=['accuracy'],
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

model.fit(padded, training_labels_final, batch_size=128, epochs=50,
          validation_data=(testing_padded, testing_labels_final))
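
# Optional: report the final held-out loss and accuracy explicitly.
loss, accuracy = model.evaluate(testing_padded, testing_labels_final, verbose=0)
print(f"Validation accuracy: {accuracy:.2%}")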

def spam_detection(message):
    """Classify a single message as 'Spam' or 'Not Spam'."""
    # Preprocess the input exactly like the training data.
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=max_length,
                                    padding=padding_type, truncating=trunc_type)

    # The model outputs a single spam probability.
    prediction = model.predict(padded_sequence)[0, 0]
    return "Spam" if prediction >= 0.5 else "Not Spam"

# Wire the classifier into a simple Gradio demo.
iface = gr.Interface(
    fn=spam_detection,
    inputs=gr.Textbox(label="Enter a message:"),
    outputs="text",
    live=True,
    theme="huggingface",
    title="Spam Message Detection",
    description="A demo app for learning purposes; the model reaches "
                "roughly 98% validation accuracy on this dataset."
)

iface.launch()