allenchienxxx commited on
Commit
fd712e2
1 Parent(s): 34daa83

Update modules.py

Browse files
Files changed (1) hide show
  1. modules.py +128 -139
modules.py CHANGED
@@ -6,65 +6,62 @@ def get_text_from_html(html_content):
6
  # print(all_text)
7
  return all_text
8
  # get text content type from email
9
- def get_text(file_path):
10
- with open(file_path, 'rb') as file:
11
- message = email.message_from_bytes(file.read())
12
- text_content = ""
13
- for part in message.walk():
14
- if part.get_content_type() == 'text/plain':
15
- text_content += part.get_payload(decode=True).decode('iso-8859-1')
16
- # print(text_content)
17
- return text_content.replace("\n","")
18
- if text_content == "":
19
- return get_text_from_html(get_html_general(file_path));
20
  from bs4 import BeautifulSoup
21
  import email
22
- def get_email_html(file_path):
23
- with open(file_path, 'rb') as file:
24
- content = email.message_from_bytes(file.read())
25
- html_content = ""
26
- for part in content.walk():
27
- if part.get_content_type() == 'text/html':
28
- html_content += part.get_payload(decode=True).decode('iso-8859-1')
29
- html_content.replace("\n","")
30
- if html_content != "":
31
- # print("Found html at "+file_path)
32
- return html_content
33
- else:
34
- # print("No html content found at "+file_path)
35
- return ""
36
 
37
  #get html by searching for <html> tag
38
- def get_html(file_path):
39
- with open(file_path, 'r',encoding='iso-8859-1') as file:
40
- html_flag = False
41
- html_content = "";
42
- tag_list = []
43
- for line in file:
44
- words = line.split()
45
- for word in words:
46
- if word == "<html>":
47
- html_flag = True;
48
- if html_flag:
49
- html_content += word
50
- if word == "</html>":
51
- html_flag = False;
52
- # print(html_content)
53
- html_content.replace("\n","")
54
- if html_content == "":
55
- # print("No html content found at "+file_path)
56
- return ""
57
- else:
58
- # print("Found html at "+file_path)
59
- return html_content
60
 
61
- def get_html_general(file_path):
62
- if get_email_html(file_path)!="":
63
- return get_email_html(file_path)
64
  else:
65
- return get_html(file_path)
66
- def get_onclicks(file_path):
67
- content = get_html_general(file_path)
68
  if content == "": return None
69
  soup = BeautifulSoup(content, 'html.parser')
70
 
@@ -72,8 +69,8 @@ def get_onclicks(file_path):
72
  # Count the number of elements with an onClick attribute
73
  count = len(elements)
74
  return count
75
- def check_popWindow(file_path):
76
- content = get_html_general(file_path)
77
  if content == "": return None
78
  soup = BeautifulSoup(content, 'html.parser')
79
 
@@ -89,8 +86,7 @@ def check_popWindow(file_path):
89
  except TypeError:
90
  return False
91
 
92
- def check_spf(file_path):
93
- with open(file_path, 'rb') as file:
94
  message = email.message_from_bytes(file.read())
95
  received_spf_header = message.get('Received-SPF')
96
  if received_spf_header == None:
@@ -107,8 +103,7 @@ def check_spf(file_path):
107
  return 0
108
  else:
109
  return 0
110
- def check_dkim(file_path):
111
- with open(file_path, 'rb') as file:
112
  message = email.message_from_bytes(file.read())
113
  auth = message.get('Authentication-Results')
114
  if auth == None:
@@ -120,8 +115,7 @@ def check_dkim(file_path):
120
  return 1
121
  else:
122
  return 0
123
- def check_dmarc(file_path):
124
- with open(file_path, 'rb') as file:
125
  message = email.message_from_bytes(file.read())
126
  auth = message.get('Authentication-Results')
127
  if auth == None:
@@ -133,8 +127,7 @@ def check_dmarc(file_path):
133
  return 1
134
  else:
135
  return 0
136
- def check_deliver_receiver(filepath):
137
- with open(filepath, 'rb') as file:
138
  message = email.message_from_bytes(file.read())
139
  deliver = message.get('Delivered-To')
140
  # print(deliver)
@@ -144,8 +137,7 @@ def check_deliver_receiver(filepath):
144
  return 1
145
  else:
146
  return 0
147
- def check_encript(filepath):
148
- with open(filepath, 'rb') as file:
149
  message = email.message_from_bytes(file.read())
150
  received_headers = message.get_all('Received')
151
  # print(received_headers)
@@ -185,8 +177,7 @@ def get_urls_from_html(html_content):
185
  # print(href)
186
  urls += [href]
187
  return urls
188
- def get_text(file_path):
189
- with open(file_path, 'rb') as file:
190
  message = email.message_from_bytes(file.read())
191
  text_content = ""
192
  for part in message.walk():
@@ -195,42 +186,42 @@ def get_text(file_path):
195
  # print(text_content)
196
  return text_content.replace("\n","")
197
  if text_content == "":
198
- return get_text_from_html(get_html_general(file_path));
199
- def get_num_words(file_path):
200
- if get_text(file_path) != "":
201
- words = len(get_text(file_path).split())
202
  return words
203
- if get_html_general(file_path) != "":
204
- words = len(get_text_from_html(get_html_general(file_path)).split())
205
  return words
206
  else:
207
  return 0
208
 
209
  # get how many characters in the email text or html
210
- def get_num_chars(file_path):
211
- if get_text(file_path) != "":
212
- chars = len(get_text(file_path).replace(" ",""))
213
  return chars
214
- if get_html_general(file_path) != "":
215
- chars = len(get_text_from_html(get_html_general(file_path)).replace(" ",""))
216
  return chars
217
  else:
218
  return 0
219
 
220
  #calculate the body richness by dividing number of words with number of characters
221
- def get_body_richness(filepath):
222
- if get_num_chars(filepath) == 0: return 0
223
- return get_num_words(filepath)/get_num_chars(filepath)
224
 
225
  #get how many function words is in the content
226
- def get_num_FunctionWords(file_path):
227
  function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
228
  content = ""
229
  count = 0
230
- if get_text(file_path) != "":
231
- content = get_text(file_path).split()
232
- elif get_html_general(file_path) != "":
233
- content = get_text_from_html(get_html_general(file_path)).split()
234
  else:
235
  return None
236
  for w in function_words:
@@ -239,8 +230,8 @@ def get_num_FunctionWords(file_path):
239
  return count
240
 
241
 
242
- def get_email_html(file_path):
243
- with open(file_path, 'rb') as file:
244
  content = email.message_from_bytes(file.read())
245
  html_content = ""
246
  for part in content.walk():
@@ -248,62 +239,60 @@ def get_email_html(file_path):
248
  html_content += part.get_payload(decode=True).decode('iso-8859-1')
249
  html_content.replace("\n","")
250
  if html_content != "":
251
- # print("Found html at "+file_path)
252
  return html_content
253
  else:
254
- # print("No html content found at "+file_path)
255
  return ""
256
 
257
  #get how many words in subject
258
- def get_num_sbj(file_path):
259
- count = len(get_subject(file_path).split())
260
  return count
261
- def get_subject(file_path):
262
- with open(file_path, 'rb') as file:
263
- message = email.message_from_bytes(file.read())
264
- headers = message.items()
265
- # Print the headers
266
- subject = ""
267
- for header in headers:
268
- if header[0] == "Subject":
269
- # print(header[1])
270
- subject = header[1]
271
- break
272
- # if subject == "":
273
- # print("No subject found")
274
- subject = re.sub(r"\s+", " ", str(subject))
275
- return subject
276
 
277
 
278
- def get_sender(file_path):
279
- with open(file_path, 'rb') as file:
280
- message = email.message_from_bytes(file.read())
281
- headers = message.items()
282
- # Print the headers
283
- sender = ""
284
- for header in headers:
285
- if header[0] == "From":
286
- # print(header[1])
287
- sender = header[1]
288
- break
289
- if sender == "":
290
- return None
291
- # subject = re.sub(r"\s+", " ", str(subject))
292
- return sender
293
 
294
  #get how many characters in subject
295
- def get_num_sbjChar(file_path):
296
- count = len(get_subject(file_path))
297
  return count
298
 
299
  #claculate the subject richness by dividing words with characters
300
- def get_sbj_richness(file_path):
301
- if get_num_sbjChar(file_path) == 0:return 0
302
- return get_num_sbj(file_path)/get_num_sbjChar(file_path)
303
 
304
  # get how many urls have ip address in it
305
- def get_num_urls_ip(file_path):
306
- content = get_html_general(file_path)
307
  if content == "": return 0
308
  urls = get_urls_from_html(content)
309
  num_ip = 0
@@ -321,15 +310,15 @@ def get_num_urls_ip(file_path):
321
  return num_ip
322
 
323
  # return the total amount of urls in html content
324
- def get_num_urls(file_path):
325
- urls = get_urls_from_html(get_html_general(file_path))
326
  if urls == []:
327
  return None
328
  return len(urls)
329
 
330
  # get how many image urls in the html
331
- def get_num_image_urls(file_path):
332
- soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
333
 
334
  # Find all <a> tags that contain an <img> tag
335
  image_links = soup.find_all('a', href=True, recursive=True, limit=None, string=None)
@@ -342,8 +331,8 @@ def get_num_image_urls(file_path):
342
  # print(f"Clickable image link: {href} - Image URL: {src}")
343
 
344
  # get numbers of urls contain domain name
345
- def get_num_domain_urls(file_path):
346
- urls = get_urls_from_html(get_html_general(file_path))
347
  domains = set()
348
  for url in urls:
349
  match = re.search(r'https?://([^/]+)/', url)
@@ -357,8 +346,8 @@ def get_num_domain_urls(file_path):
357
 
358
 
359
  #get how many urls contain port info
360
- def get_num_url_ports(file_path):
361
- urls = get_urls_from_html(get_html_general(file_path))
362
  count = 0
363
  for url in urls:
364
  parsed_url = urlparse(url)
@@ -372,6 +361,6 @@ def get_num_url_ports(file_path):
372
 
373
 
374
  #get how many characters in sender
375
- def get_chars_sender(file_path):
376
- sender = get_sender(file_path)
377
  return len(str(sender))
 
6
  # print(all_text)
7
  return all_text
8
  # get text content type from email
9
+ def get_text(file):
10
+ message = email.message_from_bytes(file.read())
11
+ text_content = ""
12
+ for part in message.walk():
13
+ if part.get_content_type() == 'text/plain':
14
+ text_content += part.get_payload(decode=True).decode('iso-8859-1')
15
+ # print(text_content)
16
+ return text_content.replace("\n","")
17
+ if text_content == "":
18
+ return get_text_from_html(get_html_general(file));
 
19
  from bs4 import BeautifulSoup
20
  import email
21
+ def get_email_html(file):
22
+ content = email.message_from_bytes(file.read())
23
+ html_content = ""
24
+ for part in content.walk():
25
+ if part.get_content_type() == 'text/html':
26
+ html_content += part.get_payload(decode=True).decode('iso-8859-1')
27
+ html_content.replace("\n","")
28
+ if html_content != "":
29
+ # print("Found html at "+file)
30
+ return html_content
31
+ else:
32
+ # print("No html content found at "+file)
33
+ return ""
 
34
 
35
  #get html by searching for <html> tag
36
+ def get_html(file):
37
+ html_flag = False
38
+ html_content = "";
39
+ tag_list = []
40
+ for line in file:
41
+ words = line.split()
42
+ for word in words:
43
+ if word == "<html>":
44
+ html_flag = True;
45
+ if html_flag:
46
+ html_content += word
47
+ if word == "</html>":
48
+ html_flag = False;
49
+ # print(html_content)
50
+ html_content.replace("\n","")
51
+ if html_content == "":
52
+ # print("No html content found at "+file)
53
+ return ""
54
+ else:
55
+ # print("Found html at "+file)
56
+ return html_content
 
57
 
58
+ def get_html_general(file):
59
+ if get_email_html(file)!="":
60
+ return get_email_html(file)
61
  else:
62
+ return get_html(file)
63
+ def get_onclicks(file):
64
+ content = get_html_general(file)
65
  if content == "": return None
66
  soup = BeautifulSoup(content, 'html.parser')
67
 
 
69
  # Count the number of elements with an onClick attribute
70
  count = len(elements)
71
  return count
72
+ def check_popWindow(file):
73
+ content = get_html_general(file)
74
  if content == "": return None
75
  soup = BeautifulSoup(content, 'html.parser')
76
 
 
86
  except TypeError:
87
  return False
88
 
89
+ def check_spf(file):
 
90
  message = email.message_from_bytes(file.read())
91
  received_spf_header = message.get('Received-SPF')
92
  if received_spf_header == None:
 
103
  return 0
104
  else:
105
  return 0
106
+ def check_dkim(file):
 
107
  message = email.message_from_bytes(file.read())
108
  auth = message.get('Authentication-Results')
109
  if auth == None:
 
115
  return 1
116
  else:
117
  return 0
118
+ def check_dmarc(file):
 
119
  message = email.message_from_bytes(file.read())
120
  auth = message.get('Authentication-Results')
121
  if auth == None:
 
127
  return 1
128
  else:
129
  return 0
130
+ def check_deliver_receiver(file):
 
131
  message = email.message_from_bytes(file.read())
132
  deliver = message.get('Delivered-To')
133
  # print(deliver)
 
137
  return 1
138
  else:
139
  return 0
140
+ def check_encript(file):
 
141
  message = email.message_from_bytes(file.read())
142
  received_headers = message.get_all('Received')
143
  # print(received_headers)
 
177
  # print(href)
178
  urls += [href]
179
  return urls
180
+ def get_text(file):
 
181
  message = email.message_from_bytes(file.read())
182
  text_content = ""
183
  for part in message.walk():
 
186
  # print(text_content)
187
  return text_content.replace("\n","")
188
  if text_content == "":
189
+ return get_text_from_html(get_html_general(file));
190
+ def get_num_words(file):
191
+ if get_text(file) != "":
192
+ words = len(get_text(file).split())
193
  return words
194
+ if get_html_general(file) != "":
195
+ words = len(get_text_from_html(get_html_general(file)).split())
196
  return words
197
  else:
198
  return 0
199
 
200
  # get how many characters in the email text or html
201
+ def get_num_chars(file):
202
+ if get_text(file) != "":
203
+ chars = len(get_text(file).replace(" ",""))
204
  return chars
205
+ if get_html_general(file) != "":
206
+ chars = len(get_text_from_html(get_html_general(file)).replace(" ",""))
207
  return chars
208
  else:
209
  return 0
210
 
211
  #calculate the body richness by dividing number of words with number of characters
212
+ def get_body_richness(file):
213
+ if get_num_chars(file) == 0: return 0
214
+ return get_num_words(file)/get_num_chars(file)
215
 
216
  #get how many function words is in the content
217
+ def get_num_FunctionWords(file):
218
  function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
219
  content = ""
220
  count = 0
221
+ if get_text(file) != "":
222
+ content = get_text(file).split()
223
+ elif get_html_general(file) != "":
224
+ content = get_text_from_html(get_html_general(file)).split()
225
  else:
226
  return None
227
  for w in function_words:
 
230
  return count
231
 
232
 
233
+ def get_email_html(file):
234
+ with open(file, 'rb') as file:
235
  content = email.message_from_bytes(file.read())
236
  html_content = ""
237
  for part in content.walk():
 
239
  html_content += part.get_payload(decode=True).decode('iso-8859-1')
240
  html_content.replace("\n","")
241
  if html_content != "":
242
+ # print("Found html at "+file)
243
  return html_content
244
  else:
245
+ # print("No html content found at "+file)
246
  return ""
247
 
248
  #get how many words in subject
249
+ def get_num_sbj(file):
250
+ count = len(get_subject(file).split())
251
  return count
252
+ def get_subject(file):
253
+ message = email.message_from_bytes(file.read())
254
+ headers = message.items()
255
+ # Print the headers
256
+ subject = ""
257
+ for header in headers:
258
+ if header[0] == "Subject":
259
+ # print(header[1])
260
+ subject = header[1]
261
+ break
262
+ # if subject == "":
263
+ # print("No subject found")
264
+ subject = re.sub(r"\s+", " ", str(subject))
265
+ return subject
 
266
 
267
 
268
+ def get_sender(file):
269
+ message = email.message_from_bytes(file.read())
270
+ headers = message.items()
271
+ # Print the headers
272
+ sender = ""
273
+ for header in headers:
274
+ if header[0] == "From":
275
+ # print(header[1])
276
+ sender = header[1]
277
+ break
278
+ if sender == "":
279
+ return None
280
+ # subject = re.sub(r"\s+", " ", str(subject))
281
+ return sender
 
282
 
283
  #get how many characters in subject
284
+ def get_num_sbjChar(file):
285
+ count = len(get_subject(file))
286
  return count
287
 
288
  #claculate the subject richness by dividing words with characters
289
+ def get_sbj_richness(file):
290
+ if get_num_sbjChar(file) == 0:return 0
291
+ return get_num_sbj(file)/get_num_sbjChar(file)
292
 
293
  # get how many urls have ip address in it
294
+ def get_num_urls_ip(file):
295
+ content = get_html_general(file)
296
  if content == "": return 0
297
  urls = get_urls_from_html(content)
298
  num_ip = 0
 
310
  return num_ip
311
 
312
  # return the total amount of urls in html content
313
+ def get_num_urls(file):
314
+ urls = get_urls_from_html(get_html_general(file))
315
  if urls == []:
316
  return None
317
  return len(urls)
318
 
319
  # get how many image urls in the html
320
+ def get_num_image_urls(file):
321
+ soup = BeautifulSoup(get_html_general(file), 'html.parser')
322
 
323
  # Find all <a> tags that contain an <img> tag
324
  image_links = soup.find_all('a', href=True, recursive=True, limit=None, string=None)
 
331
  # print(f"Clickable image link: {href} - Image URL: {src}")
332
 
333
  # get numbers of urls contain domain name
334
+ def get_num_domain_urls(file):
335
+ urls = get_urls_from_html(get_html_general(file))
336
  domains = set()
337
  for url in urls:
338
  match = re.search(r'https?://([^/]+)/', url)
 
346
 
347
 
348
  #get how many urls contain port info
349
+ def get_num_url_ports(file):
350
+ urls = get_urls_from_html(get_html_general(file))
351
  count = 0
352
  for url in urls:
353
  parsed_url = urlparse(url)
 
361
 
362
 
363
  #get how many characters in sender
364
+ def get_chars_sender(file):
365
+ sender = get_sender(file)
366
  return len(str(sender))