from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
import os

# Allow the NLTK downloads below to succeed in environments where HTTPS
# certificate verification fails (e.g. some hosted runtimes).
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')
def text_feature(file):
    # Extract the message body and collapse all whitespace into single spaces
    # (an empty body yields an empty string).
    text = get_text(file)
    if text != "":
        text = text.split()
    textlist = ' '.join(text)
    dataf = pd.DataFrame([[textlist]], columns=['text'])
    return dataf
def html_tags_feature(file):
    # Collect the HTML tag names and join them into one space-separated string
    # (an empty string when the message has no HTML part).
    tags = get_tags_from_html(get_html_general(file))
    taglist = ' '.join(tags) if tags != [] else ''
    dataf = pd.DataFrame([[taglist]], columns=['tags'])
    return dataf
def extra_feature(file):
    # Header and behaviour checks: SPF/DKIM/DMARC, delivery path,
    # encryption, onclick events and pop-up windows.
    spf = check_spf(file)
    dkim = check_dkim(file)
    dmarc = check_dmarc(file)
    deliver_receiver = check_deliver_receiver(file)
    encript = check_encript(file)
    onclick = get_onclicks(file)
    popwindow = check_popWindow(file)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
    # Normalise missing values and booleans to plain integers.
    extra_data_row = [0 if x is None else x for x in extra_data_row]
    extra_data_row = [1 if x is True else x for x in extra_data_row]
    extra_data_row = [0 if x is False else x for x in extra_data_row]
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrtpted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data
def num_feature(file):
    # Numeric features describing the body, subject, URLs and sender address.
    body_richness = get_body_richness(file)
    func_words = get_num_FunctionWords(file)
    sbj_richness = get_sbj_richness(file)
    urls = get_num_urls(file)
    ipurls = get_num_urls_ip(file)
    imageurls = get_num_image_urls(file)
    domainurls = get_num_domain_urls(file)
    urlport = get_num_url_ports(file)
    sen_chars = get_chars_sender(file)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls,
                    domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness',
                                     'Numers of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
                                     'URLs contain port information', 'Characters in senders'])
    return num_data
def get_features(file):
    # Text
    textlist = text_feature(file)
    # HTML tags
    taglist = html_tags_feature(file)
    # Extra (header/behaviour) features
    extra_data = extra_feature(file)
    # Numeric features
    num_data = num_feature(file)
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    return combined_df
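
# Column layout of the frame returned by get_features (derived from the
# feature builders above): column 0 is 'text', column 1 is 'tags',
# columns 2-10 are the nine numeric features and columns 11-17 are the
# seven extra header/behaviour features.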
def predict_content(content):
    # SVM trained on the TF-IDF representation of the message body.
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    # Stacking classifier trained on the bag of HTML tags.
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    # Random forest trained on the numeric features.
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    # Random forest trained on the extra header/behaviour features.
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"
def preprocess_content(content):
    # Transform the body text with the fitted TF-IDF vectorizer.
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    content_tfidf = tfidf.transform(content)
    return content_tfidf

def preprocess_html(html_tag):
    # Transform the tag string with the fitted count vectorizer.
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    tag_data = cv.transform(html_tag)
    return tag_data

def preprocess_num(num_df):
    # Scale the numeric features with the fitted scaler.
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    scale_num = num_scaler.transform(num_df.values)
    return scale_num

def preprocess_extra(extra_df):
    # Scale the extra features with the fitted scaler.
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    scale_extra = extra_scaler.transform(extra_df.values)
    return scale_extra
lemmatizer = WordNetLemmatizer()

def customtokenize(text):
    # Custom tokenizer: split into tokens, drop English stopwords, lemmatize.
    # Kept at module level so the pickled TF-IDF vectorizer, which appears to
    # reference it, can be loaded.
    tokens = nltk.word_tokenize(text)
    # Filter out stopwords
    stop_words = set(stopwords.words('english'))
    nostop = list(filter(lambda token: token not in stop_words, tokens))
    # Perform lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized
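
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original file): one way the
# feature extractors and the four per-model predictors above could be wired
# into a Flask upload route. The route path, the 'index.html' template, the
# 'email_file' form field and the 'uploads' directory are assumptions, as is
# the premise that the helpers in modules accept a file path.
# ---------------------------------------------------------------------------
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    results = None
    if request.method == 'POST':
        uploaded = request.files['email_file']        # assumed form field name
        os.makedirs('uploads', exist_ok=True)
        path = Path('uploads') / uploaded.filename
        uploaded.save(path)
        features = get_features(str(path))
        # Route each feature group to its own model and collect the verdicts.
        results = {
            'Content': predict_content(features['text']),
            'HTML tags': predict_html(features['tags']),
            'Numeric': predict_num(features.iloc[:, 2:11]),
            'Extra': predict_extra(features.iloc[:, 11:]),
        }
    return render_template('index.html', results=results)

if __name__ == '__main__':
    app.run()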