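"""Feature extractors for raw RFC 822 (.eml) email files.

Each helper takes a path to a message on disk and returns a single
feature (body statistics, URL counts, header authentication results)
of the kind used in phishing classifiers.
"""
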
import email
import ipaddress
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup


def get_text_from_html(html_content):
    # Strip markup and collapse every run of whitespace to one space.
    soup = BeautifulSoup(html_content, 'html.parser')
    all_text = soup.get_text()
    all_text = re.sub(r"\s+", " ", all_text)
    return all_text


def get_text(file_path):
    # Concatenate every text/plain MIME part; if the message has none,
    # fall back to text extracted from its HTML body.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    if text_content == "":
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")


def get_email_html(file_path):
    # Concatenate every text/html MIME part of the message.
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    # str.replace returns a new string, so the result must be kept.
    return html_content.replace("\n", "")


def get_html(file_path):
    # Fallback for non-MIME messages: scan the raw body for an inline
    # <html> ... </html> block. Tokens are re-joined with single spaces
    # so the result stays parseable.
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        html_flag = False
        html_content = ""
        for line in file:
            for word in line.split():
                if word == "<html>":
                    html_flag = True
                if html_flag:
                    html_content += word + " "
                if word == "</html>":
                    html_flag = False
    return html_content


def get_html_general(file_path):
    # Prefer HTML from MIME parts; otherwise scan the raw file.
    html_content = get_email_html(file_path)
    if html_content != "":
        return html_content
    return get_html(file_path)


def get_onclicks(file_path):
    # Count elements carrying an onclick handler. html.parser lowercases
    # attribute names, so 'onclick' also matches 'onClick' in the source.
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    elements = soup.find_all(attrs={'onclick': True})
    return len(elements)


def check_popWindow(file_path):
    # True when any inline script calls window.open (pop-up behaviour).
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # Guard against scripts with no text, which would otherwise raise.
    scripts = soup.find_all(
        'script', string=lambda text: text is not None and 'window.open' in text)
    return bool(scripts)


def check_spf(file_path):
    # Encode the Received-SPF result: 0 missing/fail, 1 pass, 2 neutral,
    # 3 softfail.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    if received_spf_header is None:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    elif spf_result == 'neutral':
        return 2
    elif spf_result == 'softfail':
        return 3
    return 0
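
# Illustrative only: a message carrying an (assumed) header such as
#   Received-SPF: pass (example.com: domain of a@b.example designates ...)
# would make check_spf return 1.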


def check_dkim(file_path):
    # 1 when Authentication-Results reports dkim=pass, else 0.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    # Substring match: the token may be followed by ';' or a comment.
    if 'dkim=pass' in auth:
        return 1
    return 0


def check_dmarc(file_path):
    # 1 when Authentication-Results reports dmarc=pass, else 0.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    if 'dmarc=pass' in auth:
        return 1
    return 0
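
# Illustrative only: with an (assumed) header such as
#   Authentication-Results: mx.example.com; dkim=pass; dmarc=pass
# both check_dkim and check_dmarc would return 1.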


def check_deliver_receiver(filepath):
    # 1 when Delivered-To matches the To header exactly, else 0.
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    deliver = message.get('Delivered-To')
    receiver = message.get('To')
    if deliver == receiver:
        return 1
    return 0


def check_encript(file_path):
    # 1 when any Received header mentions a "version=..." tag, i.e. at
    # least one hop was TLS-encrypted.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    if not received_headers:
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0


def get_tags_from_html(html_content):
    # List the name of every tag in the document, in document order.
    soup = BeautifulSoup(html_content, 'html.parser')
    return [tag.name for tag in soup.find_all()]


def get_urls_from_html(html_content):
    # Collect all absolute http(s) links from anchor tags.
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and re.match(r'^https?://', href):
            urls.append(href)
    return urls


def get_num_words(file_path):
    # Word count of the plain-text body, falling back to text recovered
    # from the HTML body.
    text = get_text(file_path)
    if text != "":
        return len(text.split())
    if get_html_general(file_path) != "":
        return len(get_text_from_html(get_html_general(file_path)).split())
    return 0


def get_num_chars(file_path):
    # Non-space character count of the body text.
    text = get_text(file_path)
    if text != "":
        return len(text.replace(" ", ""))
    if get_html_general(file_path) != "":
        return len(get_text_from_html(get_html_general(file_path)).replace(" ", ""))
    return 0


def get_body_richness(filepath):
    # Words per character, a crude measure of body richness.
    if get_num_chars(filepath) == 0:
        return 0
    return get_num_words(filepath) / get_num_chars(filepath)


def get_num_FunctionWords(file_path):
    # Count how many of these phishing-typical trigger words occur in
    # the body; each word is counted at most once.
    function_words = ["account", "access", "bank", "credit", "click",
                      "identity", "inconvenience", "information", "limited",
                      "log", "minutes", "password", "recently", "risk",
                      "social", "security", "service", "suspended"]
    if get_text(file_path) != "":
        content = get_text(file_path).split()
    elif get_html_general(file_path) != "":
        content = get_text_from_html(get_html_general(file_path)).split()
    else:
        return None
    count = 0
    for w in function_words:
        if w in content:
            count += 1
    return count


def get_num_sbj(file_path):
    # Number of words in the Subject header.
    return len(get_subject(file_path).split())


def get_subject(file_path):
    # Subject header with folded-header whitespace collapsed; "" when absent.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    subject = message.get('Subject', "")
    return re.sub(r"\s+", " ", str(subject))


def get_sender(file_path):
    # Raw From header, or None when the message has none.
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    return message.get('From')


def get_num_sbjChar(file_path):
    # Number of characters in the Subject header.
    return len(get_subject(file_path))


def get_sbj_richness(file_path):
    # Words per character in the Subject line.
    if get_num_sbjChar(file_path) == 0:
        return 0
    return get_num_sbj(file_path) / get_num_sbjChar(file_path)


def get_num_urls_ip(file_path):
    # Count links whose host is a literal IP address rather than a
    # domain name, a common phishing tell.
    content = get_html_general(file_path)
    if content == "":
        return 0
    num_ip = 0
    for url in get_urls_from_html(content):
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            # Regular domain name (or no hostname at all), not an IP.
            pass
    return num_ip
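
# Illustrative only: "http://203.0.113.5/login" counts as an IP URL,
# while "http://example.com/login" does not.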


def get_num_urls(file_path):
    # Total number of http(s) links, or None when there are none.
    urls = get_urls_from_html(get_html_general(file_path))
    if not urls:
        return None
    return len(urls)


def get_num_image_urls(file_path):
    # Count anchors that wrap an <img>, i.e. clickable images.
    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
    image_links = soup.find_all('a', href=True)
    image_links_with_img = [link for link in image_links if link.find('img')]
    return len(image_links_with_img)


def get_num_domain_urls(file_path):
    # Number of distinct domains among all linked URLs. urlparse also
    # handles URLs without a trailing path, which a regex on
    # "https?://([^/]+)/" would silently skip.
    urls = get_urls_from_html(get_html_general(file_path))
    domains = set()
    for url in urls:
        domain = urlparse(url).netloc
        if domain:
            domains.add(domain)
    return len(domains)


def get_num_url_ports(file_path):
    # Count links that specify an explicit port, e.g. http://host:8080/.
    urls = get_urls_from_html(get_html_general(file_path))
    count = 0
    for url in urls:
        parsed_url = urlparse(url)
        if parsed_url.port:
            count += 1
    return count


def get_chars_sender(file_path):
    # Length of the raw From header; 0 when the header is missing
    # (len(str(None)) would otherwise report 4).
    sender = get_sender(file_path)
    if sender is None:
        return 0
    return len(sender)
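

# Minimal usage sketch. The path below is hypothetical; point it at any
# RFC 822 (.eml) file to dump the extracted features.
if __name__ == "__main__":
    sample = "sample.eml"  # hypothetical file, not shipped with this module
    print("words:", get_num_words(sample))
    print("chars:", get_num_chars(sample))
    print("urls:", get_num_urls(sample))
    print("ip urls:", get_num_urls_ip(sample))
    print("spf:", check_spf(sample), "dkim:", check_dkim(sample),
          "dmarc:", check_dmarc(sample))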