allenchienxxx commited on
Commit
01cbf36
1 Parent(s): 0cc4e8f

Upload 13 files

Browse files
analze.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules import *
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ from flask import Flask, render_template, request
5
+ import nltk
6
+ import pickle
7
+ from nltk.corpus import stopwords
8
+ from nltk.stem import WordNetLemmatizer
9
+ from joblib import load
10
+ import sklearn
11
+ import ssl
12
+
13
+ try:
14
+ _create_unverified_https_context = ssl._create_unverified_context
15
+ except AttributeError:
16
+ pass
17
+ else:
18
+ ssl._create_default_https_context = _create_unverified_https_context
19
+ # nltk.download('stopwords')
20
+ # nltk.download('punkt')
21
+ # nltk.download('omw-1.4')
22
+ # nltk.download('wordnet')
23
+
24
def check_file_type(file):
    """Save the upload when it is a .eml or .txt file; return a status message."""
    suffix = Path(file.filename).suffix.lower()
    if suffix in ('.eml', '.txt'):
        save_file(file)
        return 'Extracted Features'
    return "Please select .eml or .txt file."
32
+
33
def save_file(file):
    """Persist an uploaded file under 'email files/' as UTF-8 text.

    NOTE(review): file.filename is attacker-controlled; it should be
    sanitized (e.g. ``Path(file.filename).name``) before being joined into
    a path.  Left unchanged here because other call sites build the same
    path string and must stay in sync.
    """
    file_path = 'email files/' + file.filename
    # Explicit encoding: the platform default may not be able to represent
    # the UTF-8 decoded payload, which would raise UnicodeEncodeError.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file.read().decode('utf-8'))
37
+
38
def text_feature(filepath):
    """Return a one-row DataFrame with the email's plain text in column 'text'.

    Bug fixed: the original only assigned ``textlist`` inside the
    ``if text != ""`` branch, so an email with no text content raised
    NameError; an empty string is used for that case instead.
    """
    text = get_text(filepath)
    # Collapse all whitespace runs to single spaces.
    textlist = ' '.join(text.split()) if text != "" else ""
    return pd.DataFrame([[textlist]], columns=['text'])
46
+
47
def html_tags_feature(filepath):
    """Return a one-row DataFrame with space-joined HTML tag names in 'tags'.

    Bug fixed: when no tags were found the original stored an empty *list*
    in the cell, giving the column a mixed str/list type that the
    downstream count-vectorizer (which expects string documents) cannot
    transform; an empty string is stored instead.
    """
    tags = get_tags_from_html(get_html_general(filepath))
    taglist = ' '.join(tags) if tags else ''
    return pd.DataFrame([[taglist]], columns=['tags'])
52
+
53
def extra_feature(filepath):
    """Build a one-row DataFrame of header- and HTML-derived features.

    Values are normalised so the classifier sees only numbers:
    None -> 0, True -> 1, False -> 0; ints pass through unchanged.
    """
    raw = [
        check_spf(filepath),
        check_dkim(filepath),
        check_dmarc(filepath),
        check_deliver_receiver(filepath),
        check_encript(filepath),
        get_onclicks(filepath),
        check_popWindow(filepath),
    ]
    cleaned = []
    for value in raw:
        if value is None or value is False:
            cleaned.append(0)
        elif value is True:
            cleaned.append(1)
        else:
            cleaned.append(value)
    return pd.DataFrame([cleaned], columns=[
        'SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC',
        'Deliver-to Matches Receiver', 'Message_encrtpted',
        'Onclick_events', 'Popwindow'])
68
+
69
def num_feature(filepath):
    """Build a one-row DataFrame of numeric body/subject/URL features.

    Missing values (None) are replaced by 0 before the frame is built.
    """
    row = [
        get_body_richness(filepath),
        get_num_FunctionWords(filepath),
        get_sbj_richness(filepath),
        get_num_urls(filepath),
        get_num_urls_ip(filepath),
        get_num_image_urls(filepath),
        get_num_domain_urls(filepath),
        get_num_url_ports(filepath),
        get_chars_sender(filepath),
    ]
    row = [0 if value is None else value for value in row]
    return pd.DataFrame([row], columns=[
        'body richness', 'Include function words', 'Subject richness',
        'Numers of URLs', 'IPURLs', 'ImageURLs',
        'DomainURLs', 'URLs contain port information', 'Characters in senders'])
85
def get_features(filepath):
    """Concatenate text, tag, numeric and extra feature frames side by side."""
    frames = [
        text_feature(filepath),       # raw email text
        html_tags_feature(filepath),  # html tag names
        num_feature(filepath),        # numeric features
        extra_feature(filepath),      # header-derived features
    ]
    return pd.concat(frames, axis=1)
98
+
99
+
100
def predict_content(content):
    """Classify the text feature frame; a 'ham' label maps to Legitimate."""
    clf = load("save_models/SVM_finalcontent.pkl")
    label = clf.predict(preprocess_content(content))[0]
    return "Legitimate" if label == 'ham' else "Phishing"
104
+
105
def predict_html(html_tag):
    """Classify the HTML-tag feature frame; a 'ham' label maps to Legitimate."""
    clf = load("save_models/Stack_tag.pkl")
    label = clf.predict(preprocess_html(html_tag))[0]
    return "Legitimate" if label == 'ham' else "Phishing"
109
+
110
def predict_num(num_df):
    """Classify the numeric feature frame; a 'ham' label maps to Legitimate."""
    clf = load("save_models/RF_Num.pkl")
    label = clf.predict(preprocess_num(num_df))[0]
    return "Legitimate" if label == 'ham' else "Phishing"
114
+
115
def predict_extra(extra_df):
    """Classify the extra feature frame; a 'ham' label maps to Legitimate."""
    clf = load("save_models/RF_extra.pkl")
    label = clf.predict(preprocess_extra(extra_df))[0]
    return "Legitimate" if label == 'ham' else "Phishing"
119
+
120
def preprocess_content(content):
    """Vectorize raw text with the persisted TF-IDF model."""
    with open('vectorizer/content_tfidf.pickle', 'rb') as fh:
        vectorizer = pickle.load(fh)
    return vectorizer.transform(content)
126
+
127
def preprocess_html(html_tag):
    """Vectorize HTML tag strings with the persisted count-vectorizer."""
    with open('vectorizer/html_cv.pickle', 'rb') as fh:
        vectorizer = pickle.load(fh)
    return vectorizer.transform(html_tag)
132
+
133
def preprocess_num(num_df):
    """Scale the numeric feature frame with the persisted scaler."""
    with open('vectorizer/num_scaler.pkl', 'rb') as fh:
        scaler = pickle.load(fh)
    return scaler.transform(num_df.values)
138
+
139
def preprocess_extra(extra_df):
    """Scale the extra feature frame with the persisted scaler."""
    with open('vectorizer/extra_scaler.pkl', 'rb') as fh:
        scaler = pickle.load(fh)
    return scaler.transform(extra_df.values)
144
+
145
+
146
lemmatizer = WordNetLemmatizer()


def customtokenize(text):
    """Tokenize *text*, drop English stopwords, and lemmatize the rest.

    Fixes: the parameter shadowed the builtin ``str``, and
    ``stopwords.words('english')`` was re-built and scanned linearly for
    every token — the stopword list is now loaded once into a set for
    O(1) membership tests.
    """
    tokens = nltk.word_tokenize(text)
    stop_set = set(stopwords.words('english'))
    kept = [token for token in tokens if token not in stop_set]
    return [lemmatizer.lemmatize(word) for word in kept]
main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from analze import *
2
+
3
+ app = Flask(__name__)
4
+
5
@app.route('/')
def home():
    """Serve the landing page."""
    return render_template('home.html')
8
+
9
+
10
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Accept an email upload, extract features, and render predictions.

    Bug fixed: the original called get_features()/predict_*() even when
    check_file_type() rejected the extension — in that case the file was
    never saved, so feature extraction crashed on a missing path.  The
    rejection message is now returned before any extraction runs.
    """
    if request.method == 'POST':
        # Check if a file was uploaded at all.
        if 'file' not in request.files:
            return render_template('home.html', content='No file uploaded.')
        file = request.files['file']
        # Check if the file has a filename.
        if file.filename == '':
            return render_template('home.html', content='No file selected.')
        message = check_file_type(file)
        if message != 'Extracted Features':
            # Unsupported extension: nothing was saved, skip extraction.
            return render_template('home.html', content=message)
        filepath = 'email files/' + file.filename
        return render_template('home.html',
                               content=message,
                               features=get_features(filepath),
                               pre_content=predict_content(text_feature(filepath)),
                               pre_tag=predict_html(html_tags_feature(filepath)),
                               pre_num=predict_num(num_feature(filepath)),
                               pre_extra=predict_extra(extra_feature(filepath)))

    return render_template('home.html')
30
+
31
+
32
+
33
if __name__ == '__main__':
    # Listen on all interfaces so the app is reachable from outside the host.
    app.run(host='0.0.0.0', port=8000)
modules.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_text_from_html(html_content):
    """Extract the visible text from an HTML string, collapsing whitespace."""
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Collapse every whitespace run (newlines, tabs, …) to one space.
    return re.sub(r"\s+", " ", parsed.get_text())
8
+ # get text content type from email
9
def get_text(file_path):
    """Return the concatenated text/plain payload of an email, newlines removed.

    Bug fixed: the original returned unconditionally before its
    HTML-fallback check, making that branch unreachable; when no plain
    text exists the text extracted from the HTML part is now returned,
    as the dead code intended.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    if text_content == "":
        # No plain-text part: fall back to text scraped from the HTML body.
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
20
+ from bs4 import BeautifulSoup
21
+ import email
22
def get_email_html(file_path):
    """Return the concatenated text/html payload of an email, or "" if none."""
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    parts = [part.get_payload(decode=True).decode('iso-8859-1')
             for part in message.walk()
             if part.get_content_type() == 'text/html']
    return "".join(parts)
36
+
37
+ #get html by searching for <html> tag
38
def get_html(file_path):
    """Scan a raw email file for a literal <html> ... </html> word span.

    Words between the markers are concatenated WITHOUT separators,
    mirroring the original feature-extraction behaviour; returns ""
    when no <html> marker word is found.
    """
    collected = []
    inside = False
    with open(file_path, 'r', encoding='iso-8859-1') as fh:
        for line in fh:
            for word in line.split():
                if word == "<html>":
                    inside = True
                if inside:
                    collected.append(word)
                if word == "</html>":
                    inside = False
    return "".join(collected)
60
+
61
def get_html_general(file_path):
    """Return HTML from the MIME part when present, else from a raw <html> scan.

    Fix: the original called get_email_html() twice (once for the check,
    once for the return), parsing the whole file a second time; the
    result is now computed once.
    """
    html = get_email_html(file_path)
    if html != "":
        return html
    return get_html(file_path)
66
def get_onclicks(file_path):
    """Count elements carrying an onClick attribute; None when there is no HTML."""
    html = get_html_general(file_path)
    if html == "":
        return None
    soup = BeautifulSoup(html, 'html.parser')
    return len(soup.find_all(attrs={'onClick': True}))
75
def check_popWindow(file_path):
    """True when a <script> body mentions 'window.open', False otherwise.

    Returns None when the email has no HTML content.  BeautifulSoup
    passes each script's string to the matcher; scripts without a string
    body make the lambda raise TypeError, which is treated as "no popup".
    """
    html = get_html_general(file_path)
    if html == "":
        return None
    soup = BeautifulSoup(html, 'html.parser')
    try:
        matches = soup.find_all('script', text=lambda body: 'window.open' in body)
        return bool(matches)
    except TypeError:
        return False
91
+
92
def check_spf(file_path):
    """Map the email's Received-SPF header to a numeric code.

    pass -> 1, neutral -> 2, softfail -> 3, anything else or missing -> 0.
    """
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    header = message.get('Received-SPF')
    if not header:
        return 0
    verdict = header.split()[0].lower()
    return {'pass': 1, 'neutral': 2, 'softfail': 3}.get(verdict, 0)
110
def check_dkim(file_path):
    """Return 1 when Authentication-Results contains a 'dkim=pass' token, else 0."""
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    # Whole-token match only — 'dkim=pass;' would NOT match (original behaviour).
    return 1 if 'dkim=pass' in auth.split() else 0
123
def check_dmarc(file_path):
    """Return 1 when Authentication-Results contains a 'dmarc=pass' token, else 0."""
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    # Whole-token match only, mirroring check_dkim.
    return 1 if 'dmarc=pass' in auth.split() else 0
136
def check_deliver_receiver(filepath):
    """Return 1 when Delivered-To equals To (both absent also counts), else 0."""
    with open(filepath, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    return 1 if message.get('Delivered-To') == message.get('To') else 0
147
def check_encript(filepath):
    """Return 1 when any Received header mentions 'version' (TLS hint), else 0."""
    with open(filepath, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    received = message.get_all('Received')
    if received is None:
        # Original relied on catching TypeError from iterating None.
        return 0
    return 1 if any('version' in header for header in received) else 0
160
def get_tags_from_html(html_content):
    """Return the names of all HTML tags in document order."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [tag.name for tag in soup.find_all()]
168
+ import ipaddress
169
+ from urllib.parse import urlparse
170
+ import urllib.request
171
+ from bs4 import BeautifulSoup
172
+ import re
173
+ import email
174
+
175
+ #get urls in html content
176
def get_urls_from_html(html_content):
    """Collect absolute http(s) hrefs from every anchor tag."""
    soup = BeautifulSoup(html_content, 'html.parser')
    found = []
    for anchor in soup.find_all('a'):
        target = anchor.get('href')
        if target and re.match('^https?://', target):
            found.append(target)
    return found
188
def get_text(file_path):
    """Return the concatenated text/plain payload of an email, newlines removed.

    NOTE(review): duplicate of the earlier get_text definition in this
    module — this later one wins at import time; consider deleting one.
    Bug fixed: the unconditional return made the HTML fallback
    unreachable; it now runs when no plain-text part exists.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    if text_content == "":
        # No plain-text part: fall back to text scraped from the HTML body.
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
199
def get_num_words(file_path):
    """Count whitespace-separated words in the body (HTML fallback), 0 if none."""
    plain = get_text(file_path)
    if plain != "":
        return len(plain.split())
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
208
+
209
+ # get how many characters in the email text or html
210
def get_num_chars(file_path):
    """Count non-space characters in the body (HTML fallback), 0 if none."""
    plain = get_text(file_path)
    if plain != "":
        return len(plain.replace(" ", ""))
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
219
+
220
+ #calculate the body richness by dividing number of words with number of characters
221
def get_body_richness(filepath):
    """Words-per-character ratio of the body; 0 when there are no characters."""
    chars = get_num_chars(filepath)
    return 0 if chars == 0 else get_num_words(filepath) / chars
224
+
225
+ #get how many function words is in the content
226
def get_num_FunctionWords(file_path):
    """Count how many known phishing 'function words' occur in the body.

    Each listed word counts at most once (exact word match); returns
    None when the email has neither text nor HTML content.
    """
    function_words = ["account", "access", "bank", "credit", "click",
                      "identity", "inconvenience", "information", "limited",
                      "log", "minutes", "password", "recently", "risk",
                      "social", "security", "service", "suspended"]
    plain = get_text(file_path)
    if plain != "":
        words = plain.split()
    else:
        html = get_html_general(file_path)
        if html == "":
            return None
        words = get_text_from_html(html).split()
    return sum(1 for w in function_words if w in words)
240
+
241
+
242
def get_email_html(file_path):
    """Return the concatenated text/html payload of an email, or "" if none.

    NOTE(review): duplicate of the earlier get_email_html in this module;
    this later definition wins at import time.
    """
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    parts = [part.get_payload(decode=True).decode('iso-8859-1')
             for part in message.walk()
             if part.get_content_type() == 'text/html']
    return "".join(parts)
256
+
257
+ #get how many words in subject
258
def get_num_sbj(file_path):
    """Number of words in the Subject header."""
    return len(get_subject(file_path).split())
261
def get_subject(file_path):
    """Return the Subject header with whitespace collapsed ('' when absent)."""
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    subject = ""
    for name, value in message.items():
        if name == "Subject":
            subject = value
            break
    return re.sub(r"\s+", " ", str(subject))
276
+
277
+
278
def get_sender(file_path):
    """Return the raw From header value, or None when missing/empty."""
    with open(file_path, 'rb') as fh:
        message = email.message_from_bytes(fh.read())
    for name, value in message.items():
        if name == "From":
            return value if value != "" else None
    return None
293
+
294
+ #get how many characters in subject
295
def get_num_sbjChar(file_path):
    """Number of characters in the Subject header."""
    return len(get_subject(file_path))
298
+
299
+ #claculate the subject richness by dividing words with characters
300
def get_sbj_richness(file_path):
    """Words-per-character ratio of the subject; 0 for an empty subject."""
    chars = get_num_sbjChar(file_path)
    return 0 if chars == 0 else get_num_sbj(file_path) / chars
303
+
304
+ # get how many urls have ip address in it
305
def get_num_urls_ip(file_path):
    """Count URLs whose hostname is a literal IP address; 0 when no HTML.

    Fix: the original re-ran ``from urllib.parse import urlparse`` on
    every loop iteration even though the module already imports it at
    top level; the redundant in-loop import is removed.
    """
    content = get_html_general(file_path)
    if content == "":
        return 0
    num_ip = 0
    for url in get_urls_from_html(content):
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            # Hostname is a regular domain (or None) — not an IP literal.
            pass
    return num_ip
322
+
323
+ # return the total amount of urls in html content
324
def get_num_urls(file_path):
    """Total number of URLs in the HTML content; None when there are none."""
    urls = get_urls_from_html(get_html_general(file_path))
    return len(urls) if urls else None
329
+
330
+ # get how many image urls in the html
331
def get_num_image_urls(file_path):
    """Count anchor tags (with an href) that wrap an <img> — clickable images."""
    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
    anchors = soup.find_all('a', href=True)
    return len([anchor for anchor in anchors if anchor.find('img')])
343
+
344
+ # get numbers of urls contain domain name
345
def get_num_domain_urls(file_path):
    """Count distinct domains among URLs matching 'https?://<domain>/'."""
    unique_domains = set()
    for url in get_urls_from_html(get_html_general(file_path)):
        match = re.search(r'https?://([^/]+)/', url)
        if match:
            unique_domains.add(match.group(1))
    return len(unique_domains)
357
+
358
+
359
+ #get how many urls contain port info
360
def get_num_url_ports(file_path):
    """Count URLs that carry an explicit port number."""
    urls = get_urls_from_html(get_html_general(file_path))
    return sum(1 for url in urls if urlparse(url).port)
372
+
373
+
374
+ #get how many characters in sender
375
def get_chars_sender(file_path):
    """Length of the sender header as a string (a missing sender is 'None')."""
    return len(str(get_sender(file_path)))
save_models/RF_Num.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2442b46ad908f4a41ce2030e10d3e59b92635396fb95c3a0d85aa74262720ef5
3
+ size 5911369
save_models/RF_extra.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:881e7727b12970a7e473e00944f6bfbf9afd732300ce48af8d714e1ceafcfb06
3
+ size 183913
save_models/SVM_finalcontent.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4541c52b73022168b124d0f115f717e55f50553fe6eea9afccd07524de0e019
3
+ size 4304747
save_models/Stack_tag.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e658df654d3620130d41ac50be3788e29e81b58c6974e1e89d06c59ad14a7f4
3
+ size 7632960
static/css/styles.css ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page-wide defaults */
body {
  background-color: lightblue;
  margin: 0;
  color: black;
}

/* Headings and copy */
h1 { color: #111; font-family: 'Helvetica Neue', sans-serif; font-size: 50px; font-weight: bold; letter-spacing: -1px; line-height: 1; text-align: center; }

h2 { color: #111; font-family: 'Open Sans', sans-serif; font-size: 20px; font-weight: 300; line-height: 32px; margin: 0 0 30px; text-align: center; }

p { color: #685206; font-family: 'Helvetica Neue', sans-serif; font-size: 15px; line-height: 24px; margin: 0 0 24px; text-align: justify; text-justify: inter-word; }

/* Scrollable feature list */
.list {
  max-width: 400px;
  overflow-x: auto;
  list-style: none;
}

/* Two-column layout wrapper */
.container {
  display: flex;
}

/* Dashed result panels */
.box {
  border: 5px dashed black;
  width: 500px;
  margin: 50px;
  padding: 10px;
  float: left;
}

/* Uppercase label styling for feature/prediction text */
.pretty {
  font-family: "Helvetica Neue", Arial, sans-serif;
  font-size: 14px;
  line-height: 1.5;
  text-align: left;
  text-shadow: 1px 1px 1px rgba(0, 0, 0, 0.1);
  text-transform: uppercase;
  letter-spacing: 1px;
  word-spacing: 2px;
  list-style: none;
}

/* Page header band */
.header {
  padding: 10px;
  text-align: center;
  font-size: 24px;
  border: 5px dashed black;
}

/* Upload button */
.button-81 {
  background-color: #fff;
  border: 0 solid #e2e8f0;
  border-radius: 1.5rem;
  box-sizing: border-box;
  color: #0d172a;
  cursor: pointer;
  display: inline-block;
  font-family: "Basier circle",-apple-system,system-ui,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";
  font-size: 1.1rem;
  font-weight: 600;
  line-height: 1;
  padding: 1rem 1.6rem;
  text-align: center;
  text-decoration: none #0d172a solid;
  text-decoration-thickness: auto;
  transition: all .1s cubic-bezier(.4, 0, .2, 1);
  box-shadow: 0px 1px 2px rgba(166, 175, 195, 0.25);
  user-select: none;
  -webkit-user-select: none;
  touch-action: manipulation;
}

.button-81:hover {
  background-color: #1e293b;
  color: #fff;
}

/* Larger button on wider screens */
@media (min-width: 768px) {
  .button-81 {
    font-size: 1.125rem;
    padding: 1rem 2rem;
  }
}
86
+
templates/home.html ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html>
<head>
  <title>Data Visualization</title>
  <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
</head>
<body>
  <!-- Upload form -->
  <div class="header">
    <h1>Welcome to Phishing Email Detection!</h1>
    <form method="POST" action="/upload" enctype="multipart/form-data">
      <input type="file" name="file" id="file-input" accept=".txt,.eml">
      <input type="submit" value="Upload" class="button-81">
    </form>
  </div>
  <div class="container">
    <!-- Extracted feature values (one-row DataFrame rendered column by column) -->
    <div class="box">
      {% if content %}
      <h2>{{ content }}</h2>
      {% endif %}
      <ul class="list">
        {% for feature in features %}
        <li><pre class="pretty">{{ feature }}: <p>{{ features[feature][0] }}</p></pre></li>
        {% endfor %}
      </ul>
    </div>
    <!-- Per-model predictions -->
    <div class="box">
      <h2>Prediction</h2>
      <ul class="pretty">
        <li>Content prediction: <p>{{ pre_content }}</p></li>
        <li>Html Tag prediction: <p>{{ pre_tag }}</p></li>
        <li>Numeric prediction: <p>{{ pre_num }}</p></li>
        <li>Extra prediction: <p>{{ pre_extra }}</p></li>
      </ul>
    </div>
  </div>
</body>
</html>
vectorizer/content_tfidf.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c78b2719d42bf0c36db85c60270770fb6decd878bf2e61cddf13bf2cdee8e19f
3
+ size 4422275
vectorizer/extra_scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3548c38d06e7e04f11df4bcdb29ad7aaeee985af2e3701f4f9d51a79cd7de041
3
+ size 776
vectorizer/html_cv.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84cf833e1bd8975474669746947e93a7bf4b9ec1046f9d8e88d98dc459c860f9
3
+ size 6814
vectorizer/num_scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1a847823b219353781e60d8672e4c2b88720d111dc0a543c3ece441f52ce06f
3
+ size 665