import email
import ipaddress
import re
import urllib.request
from urllib.parse import urlparse

from bs4 import BeautifulSoup
def get_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # extract all the text and collapse runs of whitespace
    all_text = soup.get_text()
    all_text = re.sub(r"\s+", " ", all_text)
    return all_text
# get the text/plain content of an email, falling back to its html body
def get_text(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    # fall back to the html body when there is no text/plain part
    if text_content == "":
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
# get the text/html content of an email, or "" when none exists
def get_email_html(file_path):
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    # str.replace returns a new string, so the result must be reassigned
    html_content = html_content.replace("\n", "")
    return html_content
# get html by scanning the raw file for <html> ... </html> tags
def get_html(file_path):
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        html_flag = False
        html_content = ""
        for line in file:
            for word in line.split():
                if word == "<html>":
                    html_flag = True
                if html_flag:
                    # rejoin with a space so attributes and word boundaries survive
                    html_content += word + " "
                if word == "</html>":
                    html_flag = False
        return html_content
# prefer the MIME-declared html part; fall back to the raw tag scan
def get_html_general(file_path):
    html_content = get_email_html(file_path)
    if html_content != "":
        return html_content
    return get_html(file_path)
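
# Minimal usage sketch of the fallback chain above; the function name and the
# "sample.eml" path are illustrative assumptions, not part of the feature set.
def _demo_extract_body(file_path="sample.eml"):
    html = get_html_general(file_path)  # MIME text/html part first, raw <html> scan second
    if html != "":
        return get_text_from_html(html)
    return get_text(file_path)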
# count the elements that carry an onclick attribute
def get_onclicks(file_path):
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # html.parser lowercases attribute names, so match 'onclick', not 'onClick'
    elements = soup.find_all(attrs={'onclick': True})
    return len(elements)
# check whether any <script> in the body opens a pop-up window
def check_popWindow(file_path):
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # guard against script tags with no string content to avoid a TypeError
    scripts = soup.find_all('script', string=lambda s: s is not None and 'window.open' in s)
    return bool(scripts)
# encode the Received-SPF result: 1=pass, 2=neutral, 3=softfail, 0=missing/other
def check_spf(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    if received_spf_header is None:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    elif spf_result == 'neutral':
        return 2
    elif spf_result == 'softfail':
        return 3
    return 0
# return 1 when the Authentication-Results header reports dkim=pass
def check_dkim(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    if 'dkim=pass' in auth.split():
        return 1
    return 0
# return 1 when the Authentication-Results header reports dmarc=pass
def check_dmarc(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    if 'dmarc=pass' in auth.split():
        return 1
    return 0
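
# Sketch of how the three authentication checks could feed one sub-vector;
# "_demo_auth_features" is an illustrative name, not used elsewhere here.
def _demo_auth_features(file_path):
    # e.g. [1, 1, 0] for an email that passes SPF and DKIM but fails DMARC
    return [check_spf(file_path), check_dkim(file_path), check_dmarc(file_path)]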
# return 1 when the Delivered-To header matches the To header
def check_deliver_receiver(filepath):
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    deliver = message.get('Delivered-To')
    receiver = message.get('To')
    return 1 if deliver == receiver else 0
# return 1 when any Received header mentions a TLS 'version' string
def check_encript(filepath):
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    if received_headers is None:
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0
# list the names of all tags in the html content
def get_tags_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    return [tag.name for tag in soup.find_all()]
# get all http(s) urls from anchor tags in the html content
def get_urls_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and re.match(r'^https?://', href):
            urls.append(href)
    return urls
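
# Example: for '<a href="http://example.com/a">x</a> <a href="mailto:a@b.c">y</a>'
# the function returns ['http://example.com/a']; the mailto: link fails the
# ^https?:// match and is skipped.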
# count the words in the email body (text part preferred, then html)
def get_num_words(file_path):
    text = get_text(file_path)
    if text != "":
        return len(text.split())
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
# get how many characters are in the email text or html, spaces excluded
def get_num_chars(file_path):
    text = get_text(file_path)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
# calculate the body richness by dividing the number of words by the number of characters
def get_body_richness(filepath):
    num_chars = get_num_chars(filepath)
    if num_chars == 0:
        return 0
    return get_num_words(filepath) / num_chars
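
# Worked example (hypothetical body, for illustration): "Verify your account
# now" has 4 words and 20 non-space characters, so its richness is
# 4 / 20 = 0.2. English words average roughly five letters, so ordinary prose
# sits near 0.2, while long unbroken strings such as urls pull the ratio down.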
# count how many of a fixed list of phishing-related function words appear in the content
def get_num_FunctionWords(file_path):
    function_words = ["account", "access", "bank", "credit", "click", "identity",
                      "inconvenience", "information", "limited", "log", "minutes",
                      "password", "recently", "risk", "social", "security",
                      "service", "suspended"]
    if get_text(file_path) != "":
        content = get_text(file_path).split()
    elif get_html_general(file_path) != "":
        content = get_text_from_html(get_html_general(file_path)).split()
    else:
        return None
    count = 0
    for w in function_words:
        if w in content:
            count += 1
    return count
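
# Note the granularity: each listed word contributes at most 1, so a body that
# repeats "password" five times scores the same as one mention. For example,
# "click here to reset your password" yields 2 ("click" and "password").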
# get how many words are in the subject
def get_num_sbj(file_path):
    return len(get_subject(file_path).split())
# get the Subject header, with whitespace collapsed
def get_subject(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    subject = ""
    for name, value in message.items():
        if name == "Subject":
            subject = value
            break
    subject = re.sub(r"\s+", " ", str(subject))
    return subject
# get the From header, or None when it is missing
def get_sender(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    sender = ""
    for name, value in message.items():
        if name == "From":
            sender = value
            break
    if sender == "":
        return None
    return sender
# get how many characters are in the subject
def get_num_sbjChar(file_path):
    return len(get_subject(file_path))
# calculate the subject richness by dividing words by characters
def get_sbj_richness(file_path):
    num_chars = get_num_sbjChar(file_path)
    if num_chars == 0:
        return 0
    return get_num_sbj(file_path) / num_chars
# get how many urls use a raw ip address as their hostname
def get_num_urls_ip(file_path):
    content = get_html_general(file_path)
    if content == "":
        return 0
    urls = get_urls_from_html(content)
    num_ip = 0
    for url in urls:
        hostname = urlparse(url).hostname
        try:
            # ip_address raises ValueError for hostnames that are not IP literals
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            pass
    return num_ip
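
# For example, "http://192.168.10.5/login" is counted because its hostname
# parses as an IP literal, while "http://example.com/login" raises ValueError
# and is skipped.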
# return the total number of urls in the html content, or None when there are none
def get_num_urls(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    if urls == []:
        return None
    return len(urls)
# get how many clickable image links (<a> tags wrapping an <img>) are in the html
def get_num_image_urls(file_path):
    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
    image_links = soup.find_all('a', href=True)
    image_links_with_img = [link for link in image_links if link.find('img')]
    return len(image_links_with_img)
# count the distinct domains among the urls
def get_num_domain_urls(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    domains = set()
    for url in urls:
        match = re.search(r'https?://([^/]+)/', url)
        if match:
            domains.add(match.group(1))
    return len(domains)
# get how many urls contain explicit port info
def get_num_url_ports(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    count = 0
    for url in urls:
        # urlparse exposes an explicit port, if present, as an int
        if urlparse(url).port:
            count += 1
    return count
# get how many characters are in the sender address
def get_chars_sender(file_path):
    sender = get_sender(file_path)
    return len(str(sender))
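
# End-to-end sketch assembling the extractors above into one feature vector;
# the function name, feature order, and "sample.eml" default are illustrative
# assumptions rather than a fixed interface.
def _demo_feature_vector(file_path="sample.eml"):
    return [
        get_num_words(file_path),           # body word count
        get_num_chars(file_path),           # body character count
        get_body_richness(file_path),       # words per character
        get_num_FunctionWords(file_path),   # distinct phishing function words
        get_sbj_richness(file_path),        # subject words per character
        check_spf(file_path),               # SPF result code
        check_dkim(file_path),              # DKIM pass flag
        check_dmarc(file_path),             # DMARC pass flag
        check_deliver_receiver(file_path),  # Delivered-To == To
        check_encript(file_path),           # TLS hint in Received headers
        get_num_urls(file_path),            # total urls (None when absent)
        get_num_urls_ip(file_path),         # urls with IP hostnames
        get_num_url_ports(file_path),       # urls with explicit ports
    ]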