|
from modules import * |
|
from pathlib import Path |
|
import pandas as pd |
|
from flask import Flask, render_template, request |
|
import nltk |
|
import pickle |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from joblib import load |
|
import sklearn |
|
import ssl |
|
|
|
# Replace the ssl module's default HTTPS context factory with the unverified
# one, when this Python build provides it.
# NOTE(review): this turns off TLS certificate verification for everything in
# the process that builds its context through
# ssl._create_default_https_context -- presumably a workaround for NLTK
# resource downloads; confirm it is still required.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context
|
|
|
|
|
|
|
|
|
|
|
def check_file_type(file):
    """Validate an upload's extension and save it when it is accepted.

    Args:
        file: Upload object exposing ``filename``; it is passed on to
            ``save_file`` when the extension is accepted.

    Returns:
        ``'Extracted Features'`` once a ``.eml`` or ``.txt`` file has been
        saved, otherwise a message asking for one of those file types.
    """
    suffix = Path(file.filename).suffix.lower()

    # Reject anything that is not an email or plain-text file up front.
    if suffix not in ('.eml', '.txt'):
        return "Please select .eml or .txt file."

    save_file(file)
    return 'Extracted Features'
|
|
|
def save_file(file):
    """Store an uploaded email inside the ``email files/`` directory.

    Only the final path component of ``file.filename`` is used, so a
    client-supplied name such as ``../../app.py`` cannot place the file
    outside the upload directory.

    Args:
        file: Upload object exposing ``filename`` and a ``read()`` method
            that returns the raw bytes of the upload.

    Raises:
        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
        OSError: If the destination cannot be opened for writing.
    """
    # The filename is chosen by the client: normalise Windows separators and
    # keep just the basename so directory parts are discarded.
    safe_name = Path(file.filename.replace('\\', '/')).name
    file_path = Path('email files') / safe_name

    # Write with the same encoding used for decoding instead of relying on
    # the platform's default text encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file.read().decode('utf-8'))
|
|
|
def text_feature(filepath):
    """Build the one-row ``text`` feature frame for an email file.

    The text returned by ``get_text`` is whitespace-normalised: it is split
    on whitespace and re-joined with single spaces.  Empty or missing text
    produces an empty string, so the result always has a ``text`` column.

    Args:
        filepath: Location of the email file, passed to ``get_text``.

    Returns:
        A ``pandas.DataFrame`` with a single row and a single ``text``
        column holding the normalised text.
    """
    # Treat a missing result like empty text so the split below cannot fail.
    text = get_text(filepath) or ""
    # Joining an empty split yields '', so no special case is needed.
    normalized = ' '.join(text.split())
    return pd.DataFrame([[normalized]], columns=['text'])
|
|
|
def html_tags_feature(filepath):
    """Build the one-row ``tags`` feature frame for an email file.

    Args:
        filepath: Location of the email file, passed to ``get_html_general``.

    Returns:
        A ``pandas.DataFrame`` with one row and a ``tags`` column.  The cell
        holds the tags joined by single spaces, or an empty list when
        ``get_tags_from_html`` returned ``[]``.
    """
    html = get_html_general(filepath)
    tags = get_tags_from_html(html)

    if tags != []:
        cell = ' '.join(tags)
    else:
        # Keep the empty list itself as the cell value when no tags exist.
        cell = []

    return pd.DataFrame([[cell]], columns=['tags'])
|
|
|
def extra_feature(filepath):
    """Collect the header/script based checks for an email as one row.

    Each check helper is called with ``filepath``; its result is normalised
    so that ``None`` and ``False`` become 0, ``True`` becomes 1, and any
    other value is kept unchanged.

    Args:
        filepath: Location of the email file handed to every check helper.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per check.
    """
    def as_number(value):
        # Identity checks on purpose: only the None/True/False singletons
        # are converted, other values pass through untouched.
        if value is None or value is False:
            return 0
        if value is True:
            return 1
        return value

    raw_values = [
        check_spf(filepath),
        check_dkim(filepath),
        check_dmarc(filepath),
        check_deliver_receiver(filepath),
        check_encript(filepath),
        get_onclicks(filepath),
        check_popWindow(filepath),
    ]
    column_names = [
        'SPF(Pass:1,Neutral:2,Softdail:3,None:0)',
        'DKIM',
        'DMARC',
        'Deliver-to Matches Receiver',
        'Message_encrtpted',
        'Onclick_events',
        'Popwindow',
    ]
    row = [as_number(value) for value in raw_values]
    return pd.DataFrame([row], columns=column_names)
|
|
|
def num_feature(filepath):
    """Collect the numeric features of an email as one row.

    Every helper is called with ``filepath`` in column order; a ``None``
    result is stored as 0.

    Args:
        filepath: Location of the email file handed to every helper.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per numeric feature.
    """
    # Dict literals evaluate in order, so the helpers run top to bottom.
    measured = {
        'body richness': get_body_richness(filepath),
        'Include function words': get_num_FunctionWords(filepath),
        'Subject richness': get_sbj_richness(filepath),
        'Numers of URLs': get_num_urls(filepath),
        'IPURLs': get_num_urls_ip(filepath),
        'ImageURLs': get_num_image_urls(filepath),
        'DomainURLs': get_num_domain_urls(filepath),
        'URLs contain port information': get_num_url_ports(filepath),
        'Characters in senders': get_chars_sender(filepath),
    }
    row = [0 if value is None else value for value in measured.values()]
    return pd.DataFrame([row], columns=list(measured))
|
def get_features(filepath):
    """Assemble every feature group of an email into a single frame.

    Args:
        filepath: Location of the email file handed to each feature builder.

    Returns:
        The column-wise concatenation of the text, tag, numeric and extra
        feature frames, in that order.
    """
    text_df = text_feature(filepath)
    tags_df = html_tags_feature(filepath)
    extra_df = extra_feature(filepath)
    numeric_df = num_feature(filepath)

    # Numeric columns are placed before the extra columns in the result.
    frames = [text_df, tags_df, numeric_df, extra_df]
    return pd.concat(frames, axis=1)
|
|
|
|
|
def predict_content(content):
    """Classify an email from its text content.

    Args:
        content: Text input forwarded to ``preprocess_content``.

    Returns:
        ``"Legitimate"`` when the first predicted label equals ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/SVM_finalcontent.pkl")
    labels = model.predict(preprocess_content(content))
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
|
def predict_html(html_tag):
    """Classify an email from its HTML tags.

    Args:
        html_tag: Tag input forwarded to ``preprocess_html``.

    Returns:
        ``"Legitimate"`` when the first predicted label equals ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/Stack_tag.pkl")
    labels = model.predict(preprocess_html(html_tag))
    is_ham = labels[0] == 'ham'
    return "Legitimate" if is_ham else "Phishing"
|
|
|
def predict_num(num_df):
    """Classify an email from its numeric features.

    Args:
        num_df: Numeric feature frame forwarded to ``preprocess_num``.

    Returns:
        ``"Legitimate"`` when the first predicted label equals ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_Num.pkl")
    labels = model.predict(preprocess_num(num_df))
    if labels[0] == 'ham':
        verdict = "Legitimate"
    else:
        verdict = "Phishing"
    return verdict
|
|
|
def predict_extra(extra_df):
    """Classify an email from its extra (header/script) features.

    Args:
        extra_df: Extra feature frame forwarded to ``preprocess_extra``.

    Returns:
        ``"Legitimate"`` when the first predicted label equals ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_extra.pkl")
    labels = model.predict(preprocess_extra(extra_df))
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
|
|
|
def preprocess_content(content):
    """Transform text content with the pickled vectorizer.

    Args:
        content: Input passed to the vectorizer's ``transform`` method.

    Returns:
        Whatever ``transform`` returns for ``content``.
    """
    # The pickle is read from a fixed local path; unpickling is only safe
    # for files the application itself trusts.
    with open('vectorizer/content_tfidf.pickle', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    return vectorizer.transform(content)
|
|
|
def preprocess_html(html_tag):
    """Transform HTML tag input with the pickled vectorizer.

    Args:
        html_tag: Input passed to the vectorizer's ``transform`` method.

    Returns:
        Whatever ``transform`` returns for ``html_tag``.
    """
    # The pickle is read from a fixed local path; unpickling is only safe
    # for files the application itself trusts.
    with open('vectorizer/html_cv.pickle', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    return vectorizer.transform(html_tag)
|
|
|
def preprocess_num(num_df):
    """Scale the numeric feature frame with the pickled scaler.

    Args:
        num_df: Frame whose ``.values`` are passed to the scaler.

    Returns:
        Whatever the scaler's ``transform`` returns for those values.
    """
    # The pickle is read from a fixed local path; unpickling is only safe
    # for files the application itself trusts.
    with open('vectorizer/num_scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    return scaler.transform(num_df.values)
|
|
|
def preprocess_extra(extra_df):
    """Scale the extra feature frame with the pickled scaler.

    Args:
        extra_df: Frame whose ``.values`` are passed to the scaler.

    Returns:
        Whatever the scaler's ``transform`` returns for those values.
    """
    # The pickle is read from a fixed local path; unpickling is only safe
    # for files the application itself trusts.
    with open('vectorizer/extra_scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    return scaler.transform(extra_df.values)
|
|
|
|
|
# Shared lemmatizer instance used by customtokenize.
lemmatizer = WordNetLemmatizer()


def customtokenize(str):
    """Tokenize text, drop English stopwords and lemmatize what remains.

    NOTE(review): presumably referenced by the pickled text vectorizer, so
    the function name should stay importable from this module -- confirm.

    Args:
        str: Text to tokenize.  The name shadows the builtin; it is kept so
            existing callers are unaffected.

    Returns:
        A list with the lemmatized non-stopword tokens, in input order.
    """
    tokens = nltk.word_tokenize(str)
    if not tokens:
        # Nothing to filter, so the stopword corpus is not touched at all.
        return []

    # Fetch the stopword list once per call instead of once per token, and
    # use a set so each membership test is constant time.
    english_stopwords = set(stopwords.words('english'))
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]