File size: 3,224 Bytes
f32e250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import codecs
import random
import nltk
from nltk import NaiveBayesClassifier, classify, word_tokenize
from nltk import Text
import gradio as gr

# Tokenizer data required by word_tokenize. Older NLTK releases load "punkt";
# recent releases load "punkt_tab" instead, so fetch both. A resource id that
# the installed NLTK does not know is reported by the downloader, not raised.
nltk.download("punkt")
nltk.download("punkt_tab")


def read_in(folder):
    """Read every non-hidden regular file in ``folder``.

    Args:
        folder: Path of the directory to read. A trailing path separator is
            optional.

    Returns:
        A list with the decoded text of each file, ordered by file name.

    Raises:
        OSError: If ``folder`` cannot be listed or a file cannot be opened.
    """
    content_list = []
    # os.listdir returns names in arbitrary order; sorting keeps the result
    # (and any seeded shuffle applied to it later) reproducible across systems.
    for doc in sorted(os.listdir(folder)):
        if doc.startswith("."):
            # Skip hidden entries such as .DS_Store.
            continue
        path = os.path.join(folder, doc)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files.
            continue
        # The context manager closes the handle even if read() raises.
        with codecs.open(path, mode="r", encoding="ISO-8859-1", errors="ignore") as doc_read:
            content_list.append(doc_read.read())
    return content_list


# Load the two corpora from folders relative to the working directory.
ham_list = read_in("ham/")
spam_list = read_in("spam/")

# Pair every email with its class label: ham entries first, spam entries after.
all_emails = (
    [(email, "ham") for email in ham_list]
    + [(email, "spam") for email in spam_list]
)

# Seed the generator before shuffling so the same input order always yields
# the same shuffled order.
random.seed(42)
random.shuffle(all_emails)


def get_features(content):
    """Turn ``content`` into a presence-feature dict.

    The text is lowercased and tokenized with ``word_tokenize``; every
    distinct token becomes a key mapped to ``True``.
    """
    tokens = word_tokenize(content.lower())
    # fromkeys keeps first-occurrence order and collapses duplicate tokens.
    return dict.fromkeys(tokens, True)


# Replace each email's raw text with its feature dict, keeping the label alongside.
all_features = [(get_features(text), tag) for text, tag in all_emails]


def train(content, proportion):
    """Split ``content`` into train and test sets and fit a Naive Bayes classifier.

    Args:
        content: Sequence of ``(features, label)`` pairs.
        proportion: Fraction of ``content``, taken from the front, that is
            used for training; the remainder becomes the test set.

    Returns:
        A ``(train_set, test_set, classifier)`` tuple.
    """
    sample_size = int(len(content) * proportion)
    # Slice the argument itself, not the module-level feature list, so the
    # function works on whatever data the caller passes in.
    train_set = content[:sample_size]
    test_set = content[sample_size:]

    classifier = NaiveBayesClassifier.train(train_set)

    return train_set, test_set, classifier


# Train on the first 80% of the shuffled feature list; the rest is held out for testing.
train_set, test_set, classifier = train(all_features, 0.80)


def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both sets and its most informative features.

    Args:
        train_set: ``(features, label)`` pairs the classifier was trained on.
        test_set: Held-out ``(features, label)`` pairs.
        classifier: Trained classifier to evaluate.
    """
    print(f"Accuracy for train set: {classify.accuracy(classifier, train_set)}")
    print(f"Accuracy for test set: {classify.accuracy(classifier, test_set)}")

    # Call the method on the instance rather than through the class, so the
    # classifier's own implementation is the one that runs.
    classifier.show_most_informative_features()


# evaluate(train_set, test_set, classifier)


def concordance(data_list, search_word):
    """Print concordance lines for ``search_word`` in every text that contains it.

    Args:
        data_list: Iterable of raw text strings.
        search_word: Word to look up; matching ignores case.
    """
    # Tokens are lowercased below, so the search word must be lowercased too,
    # otherwise a capitalised search word would never match.
    needle = search_word.lower()
    for data in data_list:
        word_list = word_tokenize(data.lower())
        if needle in word_list:
            Text(word_list).concordance(needle)


# print(f"stock in HAM")
# concordance(ham_list, "stock")
# print(f"stock in SPAM")
# concordance(spam_list, "stock")


# test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
# test_ham_list = ["See the minutes from the last meeting attached",
#                  "Investors are coming to our office on Monday"]
#
# test_all_emails = [(email, "spam") for email in test_spam_list]
# test_all_emails += [(email, "ham") for email in test_ham_list]
#
# set_test_email = [(get_features(email), label) for (email, label) in test_all_emails]
#
# evaluate(train_set, test_set, classifier)


def filter_email(email):
    """Return the label the module-level ``classifier`` assigns to ``email``."""
    email_features = get_features(email)
    predicted_label = classifier.classify(email_features)
    return predicted_label


# Build the Gradio interface: a heading, a text area for the email, a label
# for the prediction, and two groups of example inputs.
with gr.Blocks() as demo:
    gr.Markdown("""
    # Spam filter
    This spam filter will help you to know if your mail is a real one (ham) or spam.
    """)
    inp = gr.TextArea(max_lines=20, placeholder="Enter email to classify", label="Input")
    out = gr.Label()
    # Run filter_email on the text area's change event and send the result to the label.
    inp.change(fn=filter_email, inputs=inp, outputs=out)
    # Example inputs targeting the text area, grouped under the label they illustrate.
    gr.Examples(["Participate in our new lottery!", "Try out this new medicine"],
                inp,
                label="SPAM examples")
    gr.Examples(["See the minutes from the last meeting attached", "Investors are coming to our office on Monday"],
                inp,
                label="HAM examples")


# Start the app. This runs whenever the module is executed or imported,
# because there is no __main__ guard.
demo.launch()