test / modules.py
BIOML's picture
Upload 18 files
d1b2e47
def get_text_from_html(html_content):
    """Return the text of an HTML document with whitespace collapsed.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The string produced by BeautifulSoup's ``get_text()`` with every run
        of whitespace replaced by a single space.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    # Every whitespace run (spaces, tabs, newlines) becomes one space.
    return re.sub(r"\s+", " ", parsed.get_text())
# get the text/plain content of an email file
def get_text(file_path):
    """Return the body text of the email stored at ``file_path``.

    The first ``text/plain`` part met while walking the message is decoded
    as ISO-8859-1 and returned with all newline characters removed.  When
    the message has no ``text/plain`` part, the text extracted from its
    HTML (``get_html_general`` + ``get_text_from_html``) is returned.

    NOTE(review): the nesting of the ``return`` statements was reconstructed
    from a copy of this file whose indentation was lost - confirm.
    """
    with open(file_path, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    for part in message.walk():
        if part.get_content_type() != 'text/plain':
            continue
        body = part.get_payload(decode=True).decode('iso-8859-1')
        return body.replace("\n", "")

    # No text/plain part was found: use the HTML content instead.
    return get_text_from_html(get_html_general(file_path))
from bs4 import BeautifulSoup
import email
def get_email_html(file_path):
    """Return the concatenated ``text/html`` parts of an email file.

    Args:
        file_path: path of the raw email message.

    Returns:
        The payloads of all ``text/html`` parts, decoded as ISO-8859-1 and
        joined in walk order, or ``""`` when the message has no such part.
    """
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())

    # The previous version called html_content.replace("\n", "") and threw
    # the result away (str is immutable), so newlines were never removed.
    # That observable behaviour is kept; only the dead statement is gone.
    return "".join(
        part.get_payload(decode=True).decode('iso-8859-1')
        for part in content.walk()
        if part.get_content_type() == 'text/html'
    )
#get html by searching for <html> tag
def get_html(file_path):
    """Extract the ``<html>`` ... ``</html>`` section of a file by token scan.

    The file is read as ISO-8859-1 text and split on whitespace.  Collection
    starts at a token equal to ``<html>`` and stops after a token equal to
    ``</html>``; both tags are included in the result.

    Args:
        file_path: path of the file to scan.

    Returns:
        The collected tokens joined by single spaces, or ``""`` when no
        ``<html>`` token is present.
    """
    collected = []
    inside_html = False
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file:
            for word in line.split():
                if word == "<html>":
                    inside_html = True
                if inside_html:
                    collected.append(word)
                if word == "</html>":
                    inside_html = False
    # Tokens used to be concatenated with no separator, which fused tag
    # names with their attributes ("<a href=" -> "<ahref=") and merged all
    # words of the text.  Joining with a space keeps the markup parseable.
    return " ".join(collected)
def get_html_general(file_path):
    """Return the HTML of an email file, trying MIME parts before a raw scan.

    Args:
        file_path: path of the raw email message.

    Returns:
        The result of ``get_email_html`` when it is non-empty, otherwise the
        result of ``get_html``.
    """
    # Call get_email_html once; it reads and parses the whole file each time.
    html = get_email_html(file_path)
    if html != "":
        return html
    return get_html(file_path)
def get_onclicks(file_path):
    """Count the elements of the email's HTML that have an ``onclick`` attribute.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of matching elements, or ``None`` when
        ``get_html_general`` finds no HTML.
    """
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # Beautiful Soup's HTML parsers convert attribute names to lower case,
    # so the lookup key must be lower case too; the former 'onClick' key
    # could not match any parsed attribute.
    return len(soup.find_all(attrs={'onclick': True}))
def check_popWindow(file_path):
    """Tell whether any ``<script>`` in the email's HTML mentions ``window.open``.

    Args:
        file_path: path of the raw email message.

    Returns:
        ``True`` when a ``<script>`` tag's string contains ``window.open``,
        ``False`` when none does, and ``None`` when ``get_html_general``
        finds no HTML.
    """
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    for script in soup.find_all('script'):
        # .string is None for a script without a single text child (for
        # example one that only has a src attribute).  The earlier lambda
        # filter raised TypeError on such a tag, and the surrounding except
        # turned that into False even if another script matched.
        script_text = script.string
        if script_text is not None and 'window.open' in script_text:
            return True
    return False
def check_spf(file_path):
    """Encode the result found in the ``Received-SPF`` header as an integer.

    Args:
        file_path: path of the raw email message.

    Returns:
        ``1`` for ``pass``, ``2`` for ``neutral``, ``3`` for ``softfail``
        (first token of the header, compared case-insensitively) and ``0``
        for any other value or when the header is missing or empty.
    """
    with open(file_path, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    header = message.get('Received-SPF')
    if not header:
        return 0

    verdict = header.split()[0].lower()
    codes = {'pass': 1, 'neutral': 2, 'softfail': 3}
    return codes.get(verdict, 0)
def check_dkim(file_path):
    """Report whether ``Authentication-Results`` contains the token ``dkim=pass``.

    Args:
        file_path: path of the raw email message.

    Returns:
        ``1`` when the whitespace-split header contains exactly
        ``dkim=pass``; ``0`` otherwise or when the header is missing.
    """
    with open(file_path, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    return 1 if 'dkim=pass' in auth.split() else 0
def check_dmarc(file_path):
    """Report whether ``Authentication-Results`` contains the token ``dmarc=pass``.

    Args:
        file_path: path of the raw email message.

    Returns:
        ``1`` when the whitespace-split header contains exactly
        ``dmarc=pass``; ``0`` otherwise or when the header is missing.
    """
    with open(file_path, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    return 1 if 'dmarc=pass' in auth.split() else 0
def check_deliver_receiver(filepath):
    """Compare the ``Delivered-To`` and ``To`` headers of an email file.

    Args:
        filepath: path of the raw email message.

    Returns:
        ``1`` when both header values compare equal (this includes the case
        where both headers are absent), ``0`` otherwise.
    """
    with open(filepath, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    delivered_to = message.get('Delivered-To')
    addressed_to = message.get('To')
    return 1 if delivered_to == addressed_to else 0
def check_encript(filepath):
    """Report whether any ``Received`` header contains the text ``version``.

    Args:
        filepath: path of the raw email message.

    Returns:
        ``1`` when a ``Received`` header contains the substring
        ``version``; ``0`` when none does, when there is no ``Received``
        header, or when the check raises ``TypeError``.
    """
    with open(filepath, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    hops = message.get_all('Received')
    try:
        # get_all returns None when the header is absent; iterating it
        # raises TypeError, which is mapped to 0 below.
        found = any('version' in hop for hop in hops)
    except TypeError:
        return 0
    return 1 if found else 0
def get_tags_from_html(html_content):
    """List the name of every tag found in ``html_content``.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list with one tag name per element returned by ``find_all()``,
        in the order ``find_all()`` yields them.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    return [element.name for element in parsed.find_all()]
import ipaddress
from urllib.parse import urlparse
import urllib.request
from bs4 import BeautifulSoup
import re
import email
#get urls in html content
def get_urls_from_html(html_content):
    """Collect the http(s) link targets of all ``<a>`` tags in ``html_content``.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of ``href`` values that are non-empty and start with
        ``http://`` or ``https://``, in document order.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    targets = (anchor.get('href') for anchor in parsed.find_all('a'))
    # Skip anchors without an href and anything that is not an http(s) URL.
    return [target for target in targets
            if target and re.match('^https?://', target)]
def get_text(file_path):
    """Return the body text of the email stored at ``file_path``.

    The first ``text/plain`` part met while walking the message is decoded
    as ISO-8859-1 and returned with all newline characters removed.  When
    the message has no ``text/plain`` part, the text extracted from its
    HTML (``get_html_general`` + ``get_text_from_html``) is returned.

    NOTE(review): the nesting of the ``return`` statements was reconstructed
    from a copy of this file whose indentation was lost - confirm.
    """
    with open(file_path, 'rb') as handle:
        message = email.message_from_bytes(handle.read())

    for part in message.walk():
        if part.get_content_type() != 'text/plain':
            continue
        body = part.get_payload(decode=True).decode('iso-8859-1')
        return body.replace("\n", "")

    # No text/plain part was found: use the HTML content instead.
    return get_text_from_html(get_html_general(file_path))
def get_num_words(file_path):
    """Count the whitespace-separated words of an email's text.

    Args:
        file_path: path of the raw email message.

    Returns:
        The word count of ``get_text(file_path)`` when that is non-empty,
        otherwise the word count of the text extracted from the email's
        HTML, or ``0`` when neither is available.
    """
    # Each helper re-reads and re-parses the file, so call each one once.
    text = get_text(file_path)
    if text != "":
        return len(text.split())
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
# get how many characters in the email text or html
def get_num_chars(file_path):
    """Count the characters of an email's text, ignoring space characters.

    Args:
        file_path: path of the raw email message.

    Returns:
        The length of ``get_text(file_path)`` with ``" "`` removed when that
        text is non-empty, otherwise the same measure for the text extracted
        from the email's HTML, or ``0`` when neither is available.
    """
    # Each helper re-reads and re-parses the file, so call each one once.
    text = get_text(file_path)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
#calculate the body richness by dividing number of words with number of characters
def get_body_richness(filepath):
    """Return the ratio of word count to character count of the email body.

    Args:
        filepath: path of the raw email message.

    Returns:
        ``get_num_words(filepath) / get_num_chars(filepath)``, or ``0`` when
        the character count is zero.
    """
    # Evaluate the character count once; it parses the file on every call.
    num_chars = get_num_chars(filepath)
    if num_chars == 0:
        return 0
    return get_num_words(filepath) / num_chars
# count how many of the listed function words appear in the content
def get_num_FunctionWords(file_path):
    """Count how many of the listed function words occur in the email text.

    A word counts once if it appears as an exact, case-sensitive,
    whitespace-delimited token, no matter how often it occurs.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of listed words present in ``get_text(file_path)`` (or,
        when that is empty, in the text extracted from the email's HTML), or
        ``None`` when neither source yields content.
    """
    function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
    # Each helper re-reads and re-parses the file, so call each one once.
    text = get_text(file_path)
    if text != "":
        tokens = text.split()
    else:
        html = get_html_general(file_path)
        if html == "":
            return None
        tokens = get_text_from_html(html).split()
    # A set gives constant-time membership tests instead of a list scan
    # per function word.
    present = set(tokens)
    return sum(1 for w in function_words if w in present)
def get_email_html(file_path):
    """Return the concatenated ``text/html`` parts of an email file.

    Args:
        file_path: path of the raw email message.

    Returns:
        The payloads of all ``text/html`` parts, decoded as ISO-8859-1 and
        joined in walk order, or ``""`` when the message has no such part.
    """
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())

    # The previous version called html_content.replace("\n", "") and threw
    # the result away (str is immutable), so newlines were never removed.
    # That observable behaviour is kept; only the dead statement is gone.
    return "".join(
        part.get_payload(decode=True).decode('iso-8859-1')
        for part in content.walk()
        if part.get_content_type() == 'text/html'
    )
#get how many words in subject
def get_num_sbj(file_path):
    """Return the number of whitespace-separated words in the email subject.

    Args:
        file_path: path of the raw email message.
    """
    subject_words = get_subject(file_path).split()
    return len(subject_words)
def get_subject(file_path):
    """Return the subject of an email file with whitespace collapsed.

    Args:
        file_path: path of the raw email message.

    Returns:
        The value of the first ``Subject`` header with every run of
        whitespace (including folding newlines) replaced by one space, or
        ``""`` when the message has no subject.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    # Message.get matches header names case-insensitively, so "SUBJECT:" and
    # "subject:" are found as well; the earlier manual scan compared the
    # name against the exact string "Subject" and missed them.
    subject = message.get('Subject', "")
    return re.sub(r"\s+", " ", str(subject))
def get_sender(file_path):
    """Return the value of the ``From`` header of an email file.

    Args:
        file_path: path of the raw email message.

    Returns:
        The value of the first ``From`` header, or ``None`` when the header
        is missing or empty.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    # Message.get matches header names case-insensitively; the earlier
    # manual scan only recognised the exact spelling "From".
    sender = message.get('From', "")
    if sender == "":
        return None
    return sender
#get how many characters in subject
def get_num_sbjChar(file_path):
    """Return the number of characters in the email subject.

    Args:
        file_path: path of the raw email message.
    """
    return len(get_subject(file_path))
# calculate the subject richness by dividing the number of words by the number of characters
def get_sbj_richness(file_path):
    """Return the ratio of word count to character count of the subject.

    Args:
        file_path: path of the raw email message.

    Returns:
        ``get_num_sbj(file_path) / get_num_sbjChar(file_path)``, or ``0``
        when the subject has no characters.
    """
    # Evaluate the character count once; it parses the file on every call.
    num_chars = get_num_sbjChar(file_path)
    if num_chars == 0:
        return 0
    return get_num_sbj(file_path) / num_chars
# get how many urls have ip address in it
def get_num_urls_ip(file_path):
    """Count the links in the email's HTML whose host is a literal IP address.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of URLs returned by ``get_urls_from_html`` whose hostname
        parses as an IPv4 or IPv6 address; ``0`` when there is no HTML.
    """
    # Imported once per call rather than once per URL inside the loop.
    from urllib.parse import urlparse

    content = get_html_general(file_path)
    if content == "":
        return 0
    num_ip = 0
    for url in get_urls_from_html(content):
        try:
            # ip_address raises ValueError for host names (and for a missing
            # host); urlparse itself raises ValueError on a malformed
            # netloc such as unbalanced brackets.  Both mean "not an IP
            # URL", so parsing sits inside the try to avoid aborting the
            # whole count on one bad link.
            ipaddress.ip_address(urlparse(url).hostname)
        except ValueError:
            continue
        num_ip += 1
    return num_ip
# return the total amount of urls in html content
def get_num_urls(file_path):
    """Return how many http(s) links the email's HTML contains.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of URLs found by ``get_urls_from_html``, or ``None`` when
        there are none.
    """
    found = get_urls_from_html(get_html_general(file_path))
    return len(found) if found != [] else None
# get how many image urls in the html
def get_num_image_urls(file_path):
    """Count the ``<a href=...>`` links that contain an ``<img>`` tag.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of anchors with an ``href`` attribute that have an
        ``<img>`` descendant.
    """
    parsed = BeautifulSoup(get_html_general(file_path), 'html.parser')
    anchors = parsed.find_all('a', href=True, recursive=True, limit=None, string=None)
    # An anchor counts when find('img') locates an image inside it.
    return sum(1 for anchor in anchors if anchor.find('img'))
# Extract the href and src attributes of each image link
# for link in image_links_with_img:
# href = link['href']
# src = link.find('img')['src']
# print(f"Clickable image link: {href} - Image URL: {src}")
# count the distinct domains among the urls in the html
def get_num_domain_urls(file_path):
    """Count the distinct hosts among the links of the email's HTML.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of different host strings (the text between ``://`` and
        the first ``/``, ``?`` or ``#``) among the URLs returned by
        ``get_urls_from_html``.
    """
    urls = get_urls_from_html(get_html_general(file_path))
    domains = set()
    for url in urls:
        # The earlier pattern demanded a "/" after the host, so links such
        # as "http://example.com" were never counted.  Stopping at the
        # first "/", "?" or "#" accepts them and keeps the host clean.
        match = re.search(r'https?://([^/?#]+)', url)
        if match:
            domains.add(match.group(1))
    return len(domains)
#get how many urls contain port info
def get_num_url_ports(file_path):
    """Count the links in the email's HTML that specify a port number.

    Args:
        file_path: path of the raw email message.

    Returns:
        The number of URLs returned by ``get_urls_from_html`` whose parsed
        ``port`` is truthy.  URLs that cannot be parsed, or whose port is
        not a valid number, are not counted.
    """
    count = 0
    for url in get_urls_from_html(get_html_general(file_path)):
        try:
            # Reading .port raises ValueError for an invalid port, and
            # urlparse raises it for a malformed netloc; skip such links
            # instead of letting one bad href abort the count.
            port = urlparse(url).port
        except ValueError:
            continue
        if port:
            count += 1
    return count
#get how many characters in sender
def get_chars_sender(file_path):
    """Return the number of characters in the email's sender field.

    Args:
        file_path: path of the raw email message.

    Returns:
        The length of the string form of ``get_sender(file_path)``, or ``0``
        when there is no sender.
    """
    sender = get_sender(file_path)
    if sender is None:
        # str(None) is "None"; without this guard a missing sender was
        # reported as 4 characters long.
        return 0
    return len(str(sender))