# PEF / analze.py
from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
import os
# Fall back to an unverified SSL context so the nltk downloads below still work
# on systems with missing or broken certificate stores.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
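
# Optional: the four nltk.download calls above hit the network on every import.
# A minimal alternative sketch (not wired in; the resource paths are the
# standard nltk data locations) that only downloads what is missing locally:
def ensure_nltk_data():
    for pkg, path in [('stopwords', 'corpora/stopwords'),
                      ('punkt', 'tokenizers/punkt'),
                      ('omw-1.4', 'corpora/omw-1.4'),
                      ('wordnet', 'corpora/wordnet')]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(pkg)
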
def text_feature(file):
    text = get_text(file)
    # print(text)
    # Fall back to an empty string so the DataFrame can still be built when the
    # email has no extractable text.
    textlist = ""
    if text != "":
        textlist = ' '.join(text.split())
    dataf = pd.DataFrame([[textlist]], columns=['text'])
    return dataf

def html_tags_feature(file):
    tags = get_tags_from_html(get_html_general(file))
    taglist = ' '.join(tags) if tags != [] else []
    dataf = pd.DataFrame([[taglist]], columns=['tags'])
    return dataf

def extra_feature(file):
    spf = check_spf(file)
    dkim = check_dkim(file)
    dmarc = check_dmarc(file)
    deliver_receiver = check_deliver_receiver(file)
    encript = check_encript(file)
    onclick = get_onclicks(file)
    popwindow = check_popWindow(file)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
    # Normalize missing values and booleans to numeric flags for the classifier.
    extra_data_row = [0 if x is None else x for x in extra_data_row]
    extra_data_row = [1 if x is True else x for x in extra_data_row]
    extra_data_row = [0 if x is False else x for x in extra_data_row]
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrtpted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data

def num_feature(file):
    body_richness = get_body_richness(file)
    func_words = get_num_FunctionWords(file)
    sbj_richness = get_sbj_richness(file)
    urls = get_num_urls(file)
    ipurls = get_num_urls_ip(file)
    imageurls = get_num_image_urls(file)
    domainurls = get_num_domain_urls(file)
    urlport = get_num_url_ports(file)
    sen_chars = get_chars_sender(file)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness', 'Numers of URLs',
                                     'IPURLs', 'ImageURLs', 'DomainURLs', 'URLs contain port information',
                                     'Characters in senders'])
    return num_data

def get_features(file):
    # text
    textlist = text_feature(file)
    # html tags
    taglist = html_tags_feature(file)
    # extra feature
    extra_data = extra_feature(file)
    # Numeric data
    num_data = num_feature(file)
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    # print(combined_df)
    return combined_df

def predict_content(content):
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def preprocess_content(content):
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    # Transform feature input to TF-IDF
    # print(content)
    content_tfidf = tfidf.transform(content)
    return content_tfidf

def preprocess_html(html_tag):
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    tag_data = cv.transform(html_tag)
    return tag_data

def preprocess_num(num_df):
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    scale_num = num_scaler.transform(num_df.values)
    return scale_num

def preprocess_extra(extra_df):
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    scale_extra = extra_scaler.transform(extra_df.values)
    return scale_extra

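# The four helpers above reload their pickled vectorizer/scaler from disk on
# every call (and the predict_* helpers reload their models the same way). A
# small optional caching sketch, assuming the artifacts never change while the
# app is running; nothing above is changed to use it:
from functools import lru_cache

@lru_cache(maxsize=None)
def load_pickled(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

@lru_cache(maxsize=None)
def load_joblib_model(path):
    return load(path)
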
lemmatizer = WordNetLemmatizer()

def customtokenize(text):
    # Split string into tokens
    tokens = nltk.word_tokenize(text)
    # Filter out English stopwords (built once as a set for fast lookup)
    stop_words = set(stopwords.words('english'))
    nostop = [token for token in tokens if token not in stop_words]
    # Perform lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized

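# Quick standalone check of the tokenizer on an arbitrary sample sentence.
# Note: if the pickled vectorizers were fitted with tokenizer=customtokenize
# (an assumption about the training setup, not shown in this file), this
# function must stay importable under the same name for pickle.load to resolve.
if __name__ == "__main__":
    sample = "Your account has been suspended, click the link below"
    print(customtokenize(sample))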