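"""Feature extraction and model inference for a phishing email detector.

Each email file is turned into four feature sets (text body, HTML tags,
numeric statistics, and header checks), and each set is scored by its own
pre-trained classifier; a 'ham' prediction is reported as "Legitimate".
"""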
from modules import *  # helper extractors: get_text, get_tags_from_html, check_spf, etc.
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
import os
# Work around SSL certificate verification failures when fetching NLTK data
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
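
# ---- Feature extraction: each helper builds a one-row DataFrame from an email file ----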
def text_feature(file):
    """Return the email's plain-text body, whitespace-normalized, as a one-row DataFrame."""
    text = get_text(file)
    textlist = ' '.join(text.split()) if text != "" else ""
    dataf = pd.DataFrame([[textlist]], columns=['text'])
    return dataf
def html_tags_feature(file):
    """Return the email's HTML tag names as a space-separated string in a one-row DataFrame."""
    tags = get_tags_from_html(get_html_general(file))
    taglist = ' '.join(tags) if tags else ''
    dataf = pd.DataFrame([[taglist]], columns=['tags'])
    return dataf
def extra_feature(file):
    """Return header and behaviour checks (SPF, DKIM, DMARC, etc.) as a one-row DataFrame."""
    spf = check_spf(file)
    dkim = check_dkim(file)
    dmarc = check_dmarc(file)
    deliver_receiver = check_deliver_receiver(file)
    encript = check_encript(file)
    onclick = get_onclicks(file)
    popwindow = check_popWindow(file)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
    # Normalize: None/False -> 0, True -> 1; other integer codes pass through unchanged
    extra_data_row = [1 if x is True else 0 if x is None or x is False else x
                      for x in extra_data_row]
    # Column names are kept exactly as used at training time (misspellings included)
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrtpted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data
def num_feature(file):
    """Return numeric body, subject, URL, and sender statistics as a one-row DataFrame."""
    body_richness = get_body_richness(file)
    func_words = get_num_FunctionWords(file)
    sbj_richness = get_sbj_richness(file)
    urls = get_num_urls(file)
    ipurls = get_num_urls_ip(file)
    imageurls = get_num_image_urls(file)
    domainurls = get_num_domain_urls(file)
    urlport = get_num_url_ports(file)
    sen_chars = get_chars_sender(file)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls,
                    imageurls, domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]
    # Column names are kept exactly as used at training time (misspellings included)
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness',
                                     'Numers of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
                                     'URLs contain port information', 'Characters in senders'])
    return num_data
def get_features(file):
    """Combine text, HTML tag, numeric, and header features into a single row."""
    textlist = text_feature(file)
    taglist = html_tags_feature(file)
    extra_data = extra_feature(file)
    num_data = num_feature(file)
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    return combined_df
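
# ---- Model inference: each classifier predicts 'ham' (legitimate) or phishing ----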
def predict_content(content):
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"
def preprocess_content(content):
    # Transform the text body with the fitted TF-IDF vectorizer
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    content_tfidf = tfidf.transform(content)
    return content_tfidf

def preprocess_html(html_tag):
    # Transform the tag string with the fitted count vectorizer
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    tag_data = cv.transform(html_tag)
    return tag_data

def preprocess_num(num_df):
    # Scale the numeric features with the fitted scaler
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    scale_num = num_scaler.transform(num_df.values)
    return scale_num

def preprocess_extra(extra_df):
    # Scale the header-check features with the fitted scaler
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    scale_extra = extra_scaler.transform(extra_df.values)
    return scale_extra
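
# Note: the pickled TF-IDF vectorizer was presumably fitted with this custom tokenizer,
# so customtokenize must be importable from here when content_tfidf.pickle is unpickled.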
lemmatizer = WordNetLemmatizer()

def customtokenize(text):
    # Split the string into tokens
    tokens = nltk.word_tokenize(text)
    # Drop English stopwords
    nostop = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the remaining tokens
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized
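
# A minimal usage sketch (an assumption, not part of the original app flow):
# "sample.eml" is a hypothetical email file path, and the pickled models and
# vectorizers under save_models/ and vectorizer/ must exist on disk.
if __name__ == "__main__":
    sample = "sample.eml"  # hypothetical input file
    print("Content model:", predict_content(text_feature(sample)['text']))
    print("HTML tag model:", predict_html(html_tags_feature(sample)['tags']))
    print("Numeric model:", predict_num(num_feature(sample)))
    print("Header model:", predict_extra(extra_feature(sample)))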