# Spam filter demo: NLTK Naive Bayes classifier with a Gradio UI.
import os
import codecs
import random
import nltk
from nltk import NaiveBayesClassifier, classify, word_tokenize
from nltk import Text
import gradio as gr
nltk.download("punkt")
def read_in(folder):
    """Read every non-hidden file in `folder` and return their contents.

    Args:
        folder: path to a directory, with or without a trailing separator.

    Returns:
        List of file contents as strings, in `os.listdir` order.
    """
    content_list = []
    for name in os.listdir(folder):
        # Skip hidden files such as .DS_Store.
        if name.startswith("."):
            continue
        # os.path.join works whether or not `folder` ends with a slash
        # (the original `folder + doc` required a trailing slash).
        path = os.path.join(folder, name)
        # Context manager guarantees the handle is closed even on error;
        # ISO-8859-1 with errors="ignore" tolerates the raw email corpus.
        with codecs.open(path, mode="r", encoding="ISO-8859-1", errors="ignore") as doc_read:
            content_list.append(doc_read.read())
    return content_list
# Load the raw corpus: one string per email, labelled by source folder.
ham_list = read_in("ham/")
spam_list = read_in("spam/")
all_emails = [(text, "ham") for text in ham_list]
all_emails.extend((text, "spam") for text in spam_list)
# Fix the seed so the shuffle (and thus the train/test split) is reproducible.
random.seed(42)
random.shuffle(all_emails)
def get_features(content):
    """Build a bag-of-words feature dict: each lowercased token maps to True."""
    tokens = word_tokenize(content.lower())
    return dict.fromkeys(tokens, True)
# Pair each email's feature dict with its label for NLTK's trainer.
all_features = [(get_features(text), tag) for text, tag in all_emails]
def train(content, proportion):
    """Split `content` into train/test sets and fit a Naive Bayes classifier.

    Args:
        content: list of (feature_dict, label) pairs.
        proportion: fraction in [0, 1] of `content` used for training.

    Returns:
        (train_set, test_set, classifier) tuple.
    """
    sample_size = int(len(content) * proportion)
    # Bug fix: slice the `content` argument, not the global `all_features`,
    # so the function actually uses the dataset it is given. (The original
    # only worked because it happened to be called with `all_features`.)
    train_set = content[:sample_size]
    test_set = content[sample_size:]
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
# Train on 80% of the shuffled data; hold out the remaining 20% for evaluation.
train_set, test_set, classifier = train(all_features, 0.80)
def evaluate(train_set, test_set, classifier):
    """Print train/test accuracy and the classifier's most informative features.

    Args:
        train_set: list of (feature_dict, label) pairs used for training.
        test_set: held-out list of (feature_dict, label) pairs.
        classifier: a trained NaiveBayesClassifier instance.
    """
    print(f"Accuracy for train set: {classify.accuracy(classifier, train_set)}")
    print(f"Accuracy for test set: {classify.accuracy(classifier, test_set)}")
    # Idiom fix: call as an instance method instead of through the class with
    # the instance passed explicitly -- same behavior, conventional form.
    classifier.show_most_informative_features()
# evaluate(train_set, test_set, classifier) | |
def concordance(data_list, search_word):
    """Print an NLTK concordance of `search_word` for each document containing it."""
    for document in data_list:
        tokens = Text(word_tokenize(document.lower()))
        if search_word in tokens:
            tokens.concordance(search_word)
# print(f"stock in HAM") | |
# concordance(ham_list, "stock") | |
# print(f"stock in SPAM") | |
# concordance(spam_list, "stock") | |
# test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"] | |
# test_ham_list = ["See the minutes from the last meeting attached", | |
# "Investors are coming to our office on Monday"] | |
# | |
# test_all_emails = [(email, "spam") for email in test_spam_list] | |
# test_all_emails += [(email, "ham") for email in test_ham_list] | |
# | |
# set_test_email = [(get_features(email), label) for (email, label) in test_all_emails] | |
# | |
# evaluate(train_set, test_set, classifier) | |
def filter_email(email):
    """Classify a raw email string as "ham" or "spam" with the trained model."""
    features = get_features(email)
    return classifier.classify(features)
# Minimal Gradio UI: classify the typed email live and offer canned examples.
with gr.Blocks() as demo:
    gr.Markdown("""
    # Spam filter
    This spam filter will help you to know if your mail is a real one (ham) or spam.
    """)
    email_box = gr.TextArea(max_lines=20, placeholder="Enter email to classify", label="Input")
    verdict = gr.Label()
    # Re-classify on every edit of the text area.
    email_box.change(fn=filter_email, inputs=email_box, outputs=verdict)
    spam_examples = ["Participate in our new lottery!", "Try out this new medicine"]
    gr.Examples(spam_examples, email_box, label="SPAM examples")
    ham_examples = ["See the minutes from the last meeting attached", "Investors are coming to our office on Monday"]
    gr.Examples(ham_examples, email_box, label="HAM examples")
demo.launch()