from modules import *
import pandas as pd
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import ssl

# Work around machines whose Python lacks valid SSL certificates, so the
# nltk.download() calls below do not fail with CERTIFICATE_VERIFY_FAILED.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

def text_feature(file):
    # Extract the plain-text body and collapse all whitespace to single spaces.
    text = get_text(file)
    if text != "":
        textlist = ' '.join(text.split())
    else:
        textlist = ''  # fall back to an empty string so callers always get a frame
    return pd.DataFrame([[textlist]], columns=['text'])

def html_tags_feature(file):
    # Collect the HTML tag names; use an empty string (not a list) when there
    # are none, so preprocess_html always receives a string to vectorize.
    tags = get_tags_from_html(get_html_general(file))
    taglist = ' '.join(tags) if tags else ''
    return pd.DataFrame([[taglist]], columns=['tags'])

def extra_feature(file):
    # Header/authentication checks: SPF, DKIM, DMARC, Delivered-To vs. receiver,
    # encryption, onclick events, and pop-up windows.
    spf = check_spf(file)
    dkim = check_dkim(file)
    dmarc = check_dmarc(file)
    deliver_receiver = check_deliver_receiver(file)
    encript = check_encript(file)
    onclick = get_onclicks(file)
    popwindow = check_popWindow(file)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
    # Normalise: None -> 0, True -> 1, False -> 0; other values pass through.
    extra_data_row = [int(x) if isinstance(x, bool) else (0 if x is None else x)
                      for x in extra_data_row]
    # Column labels (including the misspellings) are kept exactly as in the
    # training data so they stay consistent with the saved models.
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrtpted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data

def num_feature(file):
    # Numeric stylometric and URL features of the message.
    body_richness = get_body_richness(file)
    func_words = get_num_FunctionWords(file)
    sbj_richness = get_sbj_richness(file)
    urls = get_num_urls(file)
    ipurls = get_num_urls_ip(file)
    imageurls = get_num_image_urls(file)
    domainurls = get_num_domain_urls(file)
    urlport = get_num_url_ports(file)
    sen_chars = get_chars_sender(file)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls,
                    imageurls, domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]  # missing -> 0
    # Column labels are kept exactly as in the training data.
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness',
                                     'Numers of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
                                     'URLs contain port information', 'Characters in senders'])
    return num_data

def get_features(file):
    # Build one single-row frame holding all four feature groups side by side.
    textlist = text_feature(file)          # message text
    taglist = html_tags_feature(file)      # HTML tags
    extra_data = extra_feature(file)       # header/authentication checks
    num_data = num_feature(file)           # numeric features
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    return combined_df
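
# A minimal sketch of the combined frame's layout (assuming "example.eml" is a
# hypothetical file that the helpers in modules.py can parse):
#
#   df = get_features("example.eml")
#   df['text']                 # -> preprocess_content / predict_content
#   df['tags']                 # -> preprocess_html / predict_html
#   remaining numeric columns  # -> predict_num and predict_extra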


def predict_content(content):
    # SVM classifier (SVM_finalcontent.pkl) over the TF-IDF of the message body.
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    # Classifier (Stack_tag.pkl) over the bag-of-words of the HTML tags.
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    # Random forest (RF_Num.pkl) over the scaled numeric features.
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    # Random forest (RF_extra.pkl) over the scaled header/authentication features.
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def preprocess_content(content):
    # Apply the TF-IDF vectorizer fitted during training to the body text.
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    return tfidf.transform(content)

def preprocess_html(html_tag):
    # Apply the CountVectorizer fitted during training to the tag string.
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    return cv.transform(html_tag)

def preprocess_num(num_df):
    # Scale the numeric features with the scaler fitted during training.
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    return num_scaler.transform(num_df.values)

def preprocess_extra(extra_df):
    # Scale the header/authentication features with the scaler fitted during training.
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    return extra_scaler.transform(extra_df.values)


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # build the set once, not per token

def customtokenize(text):
    # Custom tokenizer (tokenize, drop English stopwords, lemmatize). If the
    # pickled TF-IDF vectorizer was built with tokenizer=customtokenize, this
    # function must remain importable from this module for unpickling to work.
    tokens = nltk.word_tokenize(text)
    nostop = [token for token in tokens if token not in stop_words]
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized
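
# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumptions: "example.eml" is a hypothetical raw email
# file that the helpers in modules.py can parse, and the save_models/ and
# vectorizer/ artifacts are present; the project's web front end is the normal
# entry point, so this __main__ block is illustrative only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = "example.eml"  # hypothetical input file
    print("Content :", predict_content(text_feature(sample)['text']))
    print("HTML    :", predict_html(html_tags_feature(sample)['tags']))
    print("Numeric :", predict_num(num_feature(sample)))
    print("Header  :", predict_extra(extra_feature(sample)))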