Spaces:

Pravincoder
/

Spam_Message_Detection_NLP

Sleeping

App Files Files Community

Spam_Message_Detection_NLP / app.py

Pravincoder

Update app.py

eb4b2f6 verified 10 months ago

raw

history blame

3.13 kB

	## Imports
	import tensorflow as tf
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	import gradio

	## Load Data
	# The CSV is read from the working directory; the code below uses its
	# 'Category' (ham/spam label) and 'Message' (raw text) columns.
	dataset = pd.read_csv('./SPAMtextmessage.csv')

	## Data Preprocessing
	# Encode labels with an exact-match mapping: ham -> 0, spam -> 1.
	# Substring replacement would silently rewrite any label that merely
	# contains "ham" or "spam"; mapping leaves unknown labels as NaN so they
	# can be rejected with a clear error instead.
	label_codes = dataset['Category'].str.strip().map({'ham': 0, 'spam': 1})
	if label_codes.isna().any():
	    unknown_labels = sorted(
	        dataset.loc[label_codes.isna(), 'Category'].astype(str).unique()
	    )
	    raise ValueError(
	        f"Unexpected values in 'Category' column: {unknown_labels}"
	    )
	dataset['Category'] = label_codes.astype(int)

	sentences = dataset['Message'].tolist()
	labels = dataset['Category'].tolist()

	# Sequential (unshuffled) 80/20 split: the first 80% of rows are used for
	# training and the remainder for testing.
	training_size = int(len(sentences) * 0.8)

	# Sentence variables
	training_sentences = sentences[:training_size]
	testing_sentences = sentences[training_size:]

	# Labels variables
	training_labels = labels[:training_size]
	testing_labels = labels[training_size:]

	# Keras expects array-like targets, so convert the label lists to numpy.
	training_labels_final = np.array(training_labels)
	testing_labels_final = np.array(testing_labels)

	## Text Preprocessing
	# Tokenizer / padding hyper-parameters.
	vocab_size = 1000        # passed to the tokenizer as num_words
	embedding_dim = 16       # width of each learned word vector
	max_length = 100         # every sequence is padded/truncated to this length
	trunc_type = 'post'      # drop tokens from the end of over-long messages
	padding_type = 'post'    # append zeros to the end of short messages
	# Explicit, non-empty placeholder for out-of-vocabulary words; an empty
	# string here would register "" as a vocabulary entry, which is confusing
	# when inspecting word_index.
	oov_tok = "<OOV>"

	# Fit the vocabulary on the training split only so that no test-set words
	# leak into the tokenizer.
	tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
	tokenizer.fit_on_texts(training_sentences)
	word_index = tokenizer.word_index

	# Convert both splits to fixed-length integer sequences.
	sequences = tokenizer.texts_to_sequences(training_sentences)
	padded = pad_sequences(
	    sequences,
	    maxlen=max_length,
	    padding=padding_type,
	    truncating=trunc_type,
	)
	testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
	testing_padded = pad_sequences(
	    testing_sequences,
	    maxlen=max_length,
	    padding=padding_type,
	    truncating=trunc_type,
	)

	## Modeling
	# Binary classifier: embedding -> flatten -> two small ReLU layers ->
	# a single sigmoid unit whose output is compared against 0.5 at inference.
	# The input shape is declared with an explicit Input layer rather than the
	# Embedding layer's `input_length` argument, which newer Keras releases
	# deprecate.
	model = tf.keras.Sequential([
	    tf.keras.Input(shape=(max_length,), dtype="int32"),
	    tf.keras.layers.Embedding(vocab_size, embedding_dim),
	    tf.keras.layers.Flatten(),
	    tf.keras.layers.Dense(20, activation='relu'),
	    tf.keras.layers.Dense(10, activation='relu'),
	    tf.keras.layers.Dense(1, activation='sigmoid'),
	])

	# Adam with learning rate 0.01; binary cross-entropy matches the sigmoid
	# output and the 0/1 labels.
	model.compile(
	    loss='binary_crossentropy',
	    metrics=['accuracy'],
	    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
	)

	# Training runs at import time; the held-out split is used only to report
	# validation metrics after each epoch.
	model.fit(
	    padded,
	    training_labels_final,
	    batch_size=128,
	    epochs=50,
	    validation_data=(testing_padded, testing_labels_final),
	)

	## Gradio App
	def spam_detection(message):
	    """Classify a single text message as spam or not.

	    Args:
	        message: The raw message text. ``None`` is treated as an empty
	            message and any other non-string value is converted with
	            ``str()``.

	    Returns:
	        ``"Spam"`` when the model's sigmoid output is at least 0.5,
	        otherwise ``"Not Spam"``.
	    """
	    # A cleared input box may deliver None; the tokenizer needs a string.
	    text = "" if message is None else str(message)

	    # Apply the same tokenisation and padding that was used for training.
	    sequence = tokenizer.texts_to_sequences([text])
	    padded_sequence = pad_sequences(
	        sequence,
	        maxlen=max_length,
	        padding=padding_type,
	        truncating=trunc_type,
	    )

	    # predict() returns an array of shape (1, 1) for this single input;
	    # verbose=0 suppresses the per-call progress bar.
	    prediction = model.predict(padded_sequence, verbose=0)[0, 0]

	    return "Spam" if prediction >= 0.5 else "Not Spam"

	# Gradio Interface
	# The package is imported as `gradio` (no `gr` alias), so it is referenced
	# by that name here. The text box caption is passed as `label`, the
	# Textbox parameter for a visible caption.
	# NOTE(review): string theme names are version-dependent in Gradio --
	# confirm "huggingface" still resolves on the deployed version.
	iface = gradio.Interface(
	    fn=spam_detection,
	    inputs=gradio.Textbox(label="Enter a message:"),
	    outputs="text",
	    live=True,  # re-run the classifier as the user types
	    theme="huggingface",
	    title="Spam Message Detection",
	    description="A demo app for learning purposes. Detects spam messages with 98% accuracy based on the dataset.",
	)

	# Launch the app
	iface.launch()