# Project-local helpers: get_text, get_html_general, get_tags_from_html,
# check_spf/check_dkim/check_dmarc and friends are provided by this package.
from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn  # required so the pickled scikit-learn models can be unpickled
import ssl
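
# Expected layout on disk, inferred from the literal paths used below:
#   email files/  uploaded .eml / .txt messages
#   save_models/  pickled classifiers
#   vectorizer/   vectorizers and scalers fitted at training time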

# Fall back to an unverified HTTPS context so nltk.download() works on
# systems where certificate verification fails.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Uncomment on first run to fetch the NLTK data this module depends on:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('wordnet')

def check_file_type(file):
    # Accept only .eml or .txt uploads; save valid files to disk.
    file_extension = Path(file.filename).suffix.lower()
    if file_extension in ('.eml', '.txt'):
        save_file(file)
        return 'Extracted Features'
        # return get_features('email files/' + file.filename)
    else:
        return "Please select a .eml or .txt file."

def save_file(file):
    # Persist the upload under 'email files/'; the client filename is used as-is.
    file_path = 'email files/' + file.filename
    with open(file_path, 'w') as f:
        f.write(file.read().decode('utf-8'))

def text_feature(filepath):
    # Normalise the message body to a single whitespace-separated string.
    # Always return a one-row DataFrame so get_features() can concatenate it.
    text = get_text(filepath)
    textlist = ' '.join(text.split()) if text != "" else ""
    dataf = pd.DataFrame([[textlist]], columns=['text'])
    return dataf

def html_tags_feature(filepath):
    # Join the extracted HTML tag names into one space-separated string
    # (an empty string, not an empty list, when the message has no HTML).
    tags = get_tags_from_html(get_html_general(filepath))
    taglist = ' '.join(tags) if tags else ''
    dataf = pd.DataFrame([[taglist]], columns=['tags'])
    return dataf

def extra_feature(filepath):
    # Authentication results and behavioural flags from headers and body.
    spf = check_spf(filepath)
    dkim = check_dkim(filepath)
    dmarc = check_dmarc(filepath)
    deliver_receiver = check_deliver_receiver(filepath)
    encrypted = check_encript(filepath)
    onclick = get_onclicks(filepath)
    popwindow = check_popWindow(filepath)
    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encrypted, onclick, popwindow]
    # Normalise to numeric: None -> 0, True -> 1, False -> 0.
    extra_data_row = [0 if x is None else x for x in extra_data_row]
    extra_data_row = [1 if x is True else x for x in extra_data_row]
    extra_data_row = [0 if x is False else x for x in extra_data_row]
    extra_data = pd.DataFrame([extra_data_row],
                              columns=['SPF(Pass:1,Neutral:2,Softfail:3,None:0)', 'DKIM', 'DMARC',
                                       'Deliver-to Matches Receiver', 'Message_encrypted',
                                       'Onclick_events', 'Popwindow'])
    return extra_data

def num_feature(filepath):
    # Numeric stylometric and URL-based features of the message.
    body_richness = get_body_richness(filepath)
    func_words = get_num_FunctionWords(filepath)
    sbj_richness = get_sbj_richness(filepath)
    urls = get_num_urls(filepath)
    ipurls = get_num_urls_ip(filepath)
    imageurls = get_num_image_urls(filepath)
    domainurls = get_num_domain_urls(filepath)
    urlport = get_num_url_ports(filepath)
    sen_chars = get_chars_sender(filepath)
    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
    num_data_row = [0 if x is None else x for x in num_data_row]
    num_data = pd.DataFrame([num_data_row],
                            columns=['body richness', 'Include function words', 'Subject richness',
                                     'Number of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
                                     'URLs contain port information', 'Characters in sender'])
    return num_data

def get_features(filepath):
    # Assemble one feature row per message: body text, HTML tags, numeric
    # features and extra header/behaviour flags, concatenated side by side.
    textlist = text_feature(filepath)
    taglist = html_tags_feature(filepath)
    extra_data = extra_feature(filepath)
    num_data = num_feature(filepath)
    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
    return combined_df


def predict_content(content):
    # Body-text model: SVM over TF-IDF features.
    content_clf = load("save_models/SVM_finalcontent.pkl")
    predict = content_clf.predict(preprocess_content(content))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_html(html_tag):
    # HTML-tag model: stacking classifier over vectorized tag strings.
    html_clf = load("save_models/Stack_tag.pkl")
    predict = html_clf.predict(preprocess_html(html_tag))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_num(num_df):
    # Numeric-feature model: random forest on scaled values.
    num_clf = load("save_models/RF_Num.pkl")
    predict = num_clf.predict(preprocess_num(num_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def predict_extra(extra_df):
    # Extra-feature model: random forest on scaled header/behaviour flags.
    extra_clf = load("save_models/RF_extra.pkl")
    predict = extra_clf.predict(preprocess_extra(extra_df))
    return "Legitimate" if predict[0] == 'ham' else "Phishing"

def preprocess_content(content):
    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
        tfidf = pickle.load(f)
    # Transform the body text with the TF-IDF vectorizer fitted at training time
    content_tfidf = tfidf.transform(content)
    return content_tfidf

def preprocess_html(html_tag):
    with open('vectorizer/html_cv.pickle', 'rb') as f:
        cv = pickle.load(f)
    # Transform the tag string with the vectorizer fitted at training time
    tag_data = cv.transform(html_tag)
    return tag_data

def preprocess_num(num_df):
    with open('vectorizer/num_scaler.pkl', 'rb') as f:
        num_scaler = pickle.load(f)
    # Scale the numeric features with the scaler fitted at training time
    scale_num = num_scaler.transform(num_df.values)
    return scale_num

def preprocess_extra(extra_df):
    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
        extra_scaler = pickle.load(f)
    # Scale the extra flags with the scaler fitted at training time
    scale_extra = extra_scaler.transform(extra_df.values)
    return scale_extra


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def customtokenize(text):
    # Split string into tokens
    tokens = nltk.word_tokenize(text)
    # Filter out stopwords (set lookup, computed once at import time)
    nostop = [token for token in tokens if token not in stop_words]
    # Perform lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized
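

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal example of wiring the helpers together for one message. The file
# path and the simple majority vote are assumptions for demonstration; the
# Flask app may combine the four classifiers differently.
if __name__ == "__main__":
    sample = 'email files/sample.eml'  # hypothetical input file
    votes = [
        predict_content(text_feature(sample)['text']),
        predict_html(html_tags_feature(sample)['tags']),
        predict_num(num_feature(sample)),
        predict_extra(extra_feature(sample)),
    ]
    verdict = "Phishing" if votes.count("Phishing") >= 2 else "Legitimate"
    print(votes, '->', verdict)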