import os
import codecs
import random

import nltk
from nltk import NaiveBayesClassifier, Text, classify, word_tokenize

import gradio as gr

# Tokenizer models required by word_tokenize.
nltk.download("punkt")


def read_in(folder):
    """Return the text content of every non-hidden file in *folder*.

    Files are decoded as ISO-8859-1 with undecodable bytes ignored, which
    tolerates the mixed encodings common in raw email corpora.
    """
    content_list = []
    for doc in os.listdir(folder):
        # Skip hidden files such as .DS_Store.
        if doc.startswith("."):
            continue
        # `with` guarantees the handle is closed even if reading fails
        # (the original opened/closed manually and could leak on error).
        with codecs.open(os.path.join(folder, doc), mode="r",
                         encoding="ISO-8859-1", errors="ignore") as doc_read:
            content_list.append(doc_read.read())
    return content_list


ham_list = read_in("ham/")
spam_list = read_in("spam/")

# Label every email, then shuffle with a fixed seed so the train/test
# split below is reproducible across runs.
all_emails = [(email, "ham") for email in ham_list]
all_emails += [(email, "spam") for email in spam_list]
random.seed(42)
random.shuffle(all_emails)


def get_features(content):
    """Return a bag-of-words feature dict: each lower-cased token -> True."""
    return {word: True for word in word_tokenize(content.lower())}


all_features = [(get_features(email), label) for (email, label) in all_emails]


def train(content, proportion):
    """Split *content* into train/test sets and train a Naive Bayes classifier.

    Args:
        content: list of (feature_dict, label) pairs.
        proportion: fraction (0..1) of *content* used for training.

    Returns:
        (train_set, test_set, classifier) tuple.

    Note: the original sliced the global ``all_features`` instead of the
    ``content`` argument, silently ignoring the parameter; it only worked
    because the sole call site passed ``all_features``. Fixed to use
    ``content`` as the signature promises.
    """
    sample_size = int(len(content) * proportion)
    train_set = content[:sample_size]
    test_set = content[sample_size:]
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier


train_set, test_set, classifier = train(all_features, .80)


def evaluate(train_set, test_set, classifier):
    """Print train/test accuracy and the classifier's most informative features."""
    print(f"Accuracy for train set: {classify.accuracy(classifier, train_set)}")
    print(f"Accuracy for test set: {classify.accuracy(classifier, test_set)}")
    # Bound-method call; the original invoked it unbound through the class,
    # which behaves identically but obscures that it is an instance method.
    classifier.show_most_informative_features()


def concordance(data_list, search_word):
    """Print concordance lines for *search_word* in each document of *data_list*."""
    for data in data_list:
        word_list = word_tokenize(data.lower())
        text_list = Text(word_list)
        if search_word in text_list:
            text_list.concordance(search_word)


def filter_email(email):
    """Classify a single email string, returning the label "ham" or "spam"."""
    return classifier.classify(get_features(email))


with gr.Blocks() as demo:
    gr.Markdown("""
    # Spam filter
    This spam filter will help you to know if your mail is a real one (ham) or spam.
    """)
    inp = gr.TextArea(max_lines=20, placeholder="Enter email to classify", label="Input")
    out = gr.Label()
    # Re-classify live as the input text changes.
    inp.change(fn=filter_email, inputs=inp, outputs=out)
    gr.Examples(["Participate in our new lottery!", "Try out this new medicine"],
                inp, label="SPAM examples")
    gr.Examples(["See the minutes from the last meeting attached",
                 "Investors are coming to our office on Monday"],
                inp, label="HAM examples")

demo.launch()