spam_filter / app.py
Jonathan Jimenez
Add application file
f32e250
import os
import codecs
import random
import nltk
from nltk import NaiveBayesClassifier, classify, word_tokenize
from nltk import Text
import gradio as gr
# Fetch the tokenizer data that word_tokenize needs at runtime.
# Older NLTK releases load the pickled "punkt" models, while recent releases
# look for the "punkt_tab" resource instead; requesting both keeps the script
# working across NLTK versions. nltk.download is a no-op for resources that
# are already installed.
nltk.download("punkt")
nltk.download("punkt_tab")
def read_in(folder):
    """Read every non-hidden regular file in *folder* and return the contents.

    Files are decoded as ISO-8859-1, ignoring decoding errors.

    Args:
        folder: Path of the directory to read. A trailing path separator is
            accepted but not required.

    Returns:
        A list of strings, one per file, ordered by file name.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
    """
    content_list = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # deterministic across runs and platforms.
    for doc in sorted(os.listdir(folder)):
        if doc.startswith("."):
            continue  # skip hidden files such as .DS_Store
        path = os.path.join(folder, doc)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be read as documents
        # The context manager closes the handle even if read() raises.
        with codecs.open(path, mode="r", encoding="ISO-8859-1", errors="ignore") as doc_read:
            content_list.append(doc_read.read())
    return content_list
# Load the two corpora from their directories.
ham_list = read_in("ham/")
spam_list = read_in("spam/")

# Pair every email with its label: all ham first, then all spam.
all_emails = [
    (text, tag)
    for tag, texts in (("ham", ham_list), ("spam", spam_list))
    for text in texts
]

# Shuffle in place with a fixed seed so the labels are interleaved the same
# way for a given input order.
random.seed(42)
random.shuffle(all_emails)
def get_features(content):
    """Turn a text into a bag-of-words feature dictionary.

    The text is lower-cased and split with ``word_tokenize``; each distinct
    token becomes a key whose value is ``True``.

    Args:
        content: The text to featurize.

    Returns:
        A dict mapping every token found in *content* to ``True``.
    """
    tokens = word_tokenize(content.lower())
    return dict.fromkeys(tokens, True)
# Featurize every labelled email, keeping the (features, label) pairing and
# the order of all_emails.
all_features = [(get_features(text), tag) for text, tag in all_emails]
def train(content, proportion):
    """Split *content* into train/test parts and fit a Naive Bayes classifier.

    Args:
        content: Sequence of ``(features, label)`` pairs.
        proportion: Fraction of *content*, taken from the start, that is used
            for training; the remainder becomes the test set.

    Returns:
        A tuple ``(train_set, test_set, classifier)`` where *classifier* is
        the ``NaiveBayesClassifier`` trained on *train_set*.
    """
    sample_size = int(len(content) * proportion)
    # Slice the argument itself rather than the module-level all_features, so
    # the split always matches the data whose length was measured.
    train_set = content[:sample_size]
    test_set = content[sample_size:]
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
# Fit the classifier on the first 80% of the featurized emails and keep the
# remaining 20% as the held-out test set.
train_set, test_set, classifier = train(all_features, proportion=0.80)
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy and its most informative features.

    Args:
        train_set: Labelled feature sets the classifier was trained on.
        test_set: Labelled feature sets held out from training.
        classifier: The trained classifier to score.
    """
    train_accuracy = classify.accuracy(classifier, train_set)
    print(f"Accuracy for train set: {train_accuracy}")

    test_accuracy = classify.accuracy(classifier, test_set)
    print(f"Accuracy for test set: {test_accuracy}")

    NaiveBayesClassifier.show_most_informative_features(classifier)
# evaluate(train_set, test_set, classifier)
def concordance(data_list, search_word):
    """Print a concordance of *search_word* for each text that contains it.

    Each text is lower-cased and tokenized; texts that do not contain the
    word are skipped.

    Args:
        data_list: Iterable of raw text strings.
        search_word: The word to look up. Matching is case-insensitive.
    """
    # Tokens are lower-cased below, so the search word must be normalized the
    # same way or a word containing upper-case letters would never match.
    needle = search_word.lower()
    for data in data_list:
        word_list = word_tokenize(data.lower())
        if needle not in word_list:
            continue  # avoid building a Text for texts without the word
        Text(word_list).concordance(needle)
# print(f"stock in HAM")
# concordance(ham_list, "stock")
# print(f"stock in SPAM")
# concordance(spam_list, "stock")
# test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
# test_ham_list = ["See the minutes from the last meeting attached",
# "Investors are coming to our office on Monday"]
#
# test_all_emails = [(email, "spam") for email in test_spam_list]
# test_all_emails += [(email, "ham") for email in test_ham_list]
#
# set_test_email = [(get_features(email), label) for (email, label) in test_all_emails]
#
# evaluate(train_set, test_set, classifier)
def filter_email(email):
    """Classify one email text with the module-level trained classifier.

    Args:
        email: Raw email text.

    Returns:
        The label that the classifier assigns to the email's features.
    """
    features = get_features(email)
    label = classifier.classify(features)
    return label
# Build the Gradio interface: a text area whose content is re-classified on
# every change, a label showing the prediction, and clickable example inputs.
with gr.Blocks() as demo:
    gr.Markdown("""
# Spam filter
This spam filter will help you to know if your mail is a real one (ham) or spam.
""")
    inp = gr.TextArea(
        max_lines=20,
        placeholder="Enter email to classify",
        label="Input",
    )
    out = gr.Label()

    # Re-run the classifier whenever the input text changes.
    inp.change(filter_email, inp, out)

    # Clicking an example fills the input box with that text.
    gr.Examples(
        examples=["Participate in our new lottery!", "Try out this new medicine"],
        inputs=inp,
        label="SPAM examples",
    )
    gr.Examples(
        examples=[
            "See the minutes from the last meeting attached",
            "Investors are coming to our office on Monday",
        ],
        inputs=inp,
        label="HAM examples",
    )

# Start the web server for the interface.
demo.launch()