from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
# Work around machines whose certificate store breaks NLTK's HTTPS downloads.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Uncomment on first run to fetch the required NLTK data:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('wordnet')

def check_file_type(file):
    file_extension = Path(file.filename).suffix.lower()
    if file_extension in ('.eml', '.txt'):
        save_file(file)
        return 'Extracted Features'
        # return get_features('email files/' + file.filename)
    else:
        return "Please select a .eml or .txt file."

def save_file(file):
    file_path = 'email files/' + file.filename
    # Decode the uploaded bytes and write them out with an explicit encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file.read().decode('utf-8'))

def text_feature(filepath):
    text = get_text(filepath)
    # Collapse runs of whitespace into single spaces; empty text yields "".
    textlist = ' '.join(text.split()) if text != "" else ""
    dataf = pd.DataFrame([[textlist]], columns=['text'])
    return dataf

def html_tags_feature(filepath):
    tags = get_tags_from_html(get_html_general(filepath))
    # Join the tag names into one space-separated string ("" when no tags).
    taglist = ' '.join(tags) if tags else ''
    dataf = pd.DataFrame([[taglist]], columns=['tags'])
    return dataf

def extra_feature(filepath):
    spf = check_spf(filepath)
    dkim = check_dkim(filepath)
    dmarc = check_dmarc(filepath)
    deliver_receiver = check_deliver_receiver(filepath)
    encript = check_encript(filepath)
    onclick = get_onclicks(filepath)
    popwindow = check_popWindow(filepath)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
    # Map missing values to 0 and booleans to 0/1 so the scaler sees numbers only.
    extra_data_row = [0 if x is None else x for x in extra_data_row]
    extra_data_row = [1 if x is True else x for x in extra_data_row]
    extra_data_row = [0 if x is False else x for x in extra_data_row]
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softfail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrypted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data

def num_feature(filepath):
    body_richness = get_body_richness(filepath)
    func_words = get_num_FunctionWords(filepath)
    sbj_richness = get_sbj_richness(filepath)
    urls = get_num_urls(filepath)
    ipurls = get_num_urls_ip(filepath)
    imageurls = get_num_image_urls(filepath)
    domainurls = get_num_domain_urls(filepath)
    urlport = get_num_url_ports(filepath)
    sen_chars = get_chars_sender(filepath)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness',
                                     'Number of URLs', 'IPURLs', 'ImageURLs',
                                     'DomainURLs', 'URLs contain port information', 'Characters in senders'])
    return num_data

def get_features(filepath):
    # Text
    textlist = text_feature(filepath)
    # HTML tags
    taglist = html_tags_feature(filepath)
    # Extra (header/authentication) features
    extra_data = extra_feature(filepath)
    # Numeric features
    num_data = num_feature(filepath)
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    # print(combined_df)
    return combined_df

def predict_content(content):
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def preprocess_content(content):
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    # Transform the raw text into the TF-IDF space the classifier was trained on.
    content_tfidf = tfidf.transform(content)
    return content_tfidf

def preprocess_html(html_tag):
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    tag_data = cv.transform(html_tag)
    return tag_data

def preprocess_num(num_df):
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    scale_num = num_scaler.transform(num_df.values)
    return scale_num

def preprocess_extra(extra_df):
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    scale_extra = extra_scaler.transform(extra_df.values)
    return scale_extra
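
# A minimal sketch of a combined verdict, not part of the original module:
# `predict_email` is a hypothetical helper that assumes the column layout
# produced by get_features (text, tags, then 9 numeric and 7 extra columns)
# and flags the email as phishing if any single model says so.
def predict_email(filepath):
    features = get_features(filepath)
    results = {
        'content': predict_content(features['text']),
        'html': predict_html(features['tags']),
        'numeric': predict_num(features.iloc[:, 2:11]),
        'extra': predict_extra(features.iloc[:, 11:]),
    }
    verdict = "Phishing" if "Phishing" in results.values() else "Legitimate"
    return verdict, results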

lemmatizer = WordNetLemmatizer()

def customtokenize(text):
    # Split the string into tokens.
    tokens = nltk.word_tokenize(text)
    # Filter out English stopwords (build the set once per call, not per token).
    stop_words = set(stopwords.words('english'))
    nostop = [token for token in tokens if token not in stop_words]
    # Perform lemmatization.
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized
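
# Example usage, assuming an email already saved under 'email files/'.
# 'sample.eml' is a placeholder file name for illustration only.
if __name__ == "__main__":
    verdict, per_model = predict_email('email files/sample.eml')
    print(per_model)
    print('Overall verdict:', verdict)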