Spaces:

BIOML
/

test

Running

test / modules.py

Upload 18 files

d1b2e47 about 1 year ago

12.6 kB

	def get_text_from_html(html_content):
	    """Return the text of an HTML string with whitespace collapsed.

	    The markup is parsed with BeautifulSoup's ``html.parser``; every run of
	    whitespace in the extracted text is then replaced by a single space.
	    """
	    parsed = BeautifulSoup(html_content, 'html.parser')
	    return re.sub(r"\s+", " ", parsed.get_text())
	# get the text/plain content of an email, falling back to text taken from its HTML
	def get_text(file_path):
	    """Return the email's textual body as one line.

	    All ``text/plain`` parts of the message stored at *file_path* are decoded
	    and concatenated, and newlines are replaced by spaces.  When the message
	    has no plain-text content, the text extracted from its HTML (via
	    ``get_html_general`` and ``get_text_from_html``) is returned instead.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    chunks = []
	    for part in message.walk():
	        if part.get_content_type() != 'text/plain':
	            continue
	        payload = part.get_payload(decode=True) or b""
	        # Honour the charset declared by the part; iso-8859-1 stays the
	        # fallback because it can decode any byte sequence.
	        charset = part.get_content_charset() or 'iso-8859-1'
	        try:
	            chunks.append(payload.decode(charset, errors='replace'))
	        except LookupError:
	            # Charset name unknown to Python.
	            chunks.append(payload.decode('iso-8859-1'))

	    text_content = "".join(chunks)
	    if text_content == "":
	        return get_text_from_html(get_html_general(file_path))
	    # Replace (rather than delete) newlines so the last word of one line is
	    # not glued to the first word of the next.
	    return text_content.replace("\n", " ")
	from bs4 import BeautifulSoup
	import email
	def get_email_html(file_path):
	    """Return the concatenated ``text/html`` parts of an email file.

	    The file at *file_path* is parsed as an email message and every part
	    whose content type is ``text/html`` is decoded and appended.  An empty
	    string is returned when the message has no such part.
	    """
	    with open(file_path, 'rb') as file:
	        content = email.message_from_bytes(file.read())

	    pieces = []
	    for part in content.walk():
	        if part.get_content_type() != 'text/html':
	            continue
	        payload = part.get_payload(decode=True) or b""
	        # Honour the charset declared by the part; iso-8859-1 stays the
	        # fallback because it can decode any byte sequence.
	        charset = part.get_content_charset() or 'iso-8859-1'
	        try:
	            pieces.append(payload.decode(charset, errors='replace'))
	        except LookupError:
	            # Charset name unknown to Python.
	            pieces.append(payload.decode('iso-8859-1'))
	    # NOTE: the previous `html_content.replace("\n", "")` discarded its
	    # result (str is immutable), so newlines were always kept; they still are.
	    return "".join(pieces)

	#get html by searching the raw file for <html> ... </html> tokens
	def get_html(file_path):
	    """Return the HTML found between ``<html>`` and ``</html>`` tokens.

	    The file is read as iso-8859-1 text and split on whitespace.  Tokens are
	    collected from a token equal to ``<html>`` up to and including a token
	    equal to ``</html>`` (compared case-insensitively) and joined with single
	    spaces.  An empty string is returned when no ``<html>`` token exists.
	    """
	    collected = []
	    inside_html = False
	    with open(file_path, 'r', encoding='iso-8859-1') as file:
	        for line in file:
	            for word in line.split():
	                token = word.lower()
	                if token == "<html>":
	                    inside_html = True
	                if inside_html:
	                    collected.append(word)
	                if token == "</html>":
	                    inside_html = False
	    # Join with a space: gluing tokens together turned `<a href=...>` into
	    # `<ahref=...>` and merged adjacent words of the visible text.
	    return " ".join(collected)

	def get_html_general(file_path):
	    """Return the HTML of the email at *file_path*, or ``""`` if none is found.

	    The MIME ``text/html`` parts (``get_email_html``) are preferred; when they
	    yield nothing, the raw-file scan in ``get_html`` is used instead.
	    """
	    # Call the MIME extractor once; calling it twice re-read and re-parsed
	    # the whole file.
	    html_content = get_email_html(file_path)
	    if html_content != "":
	        return html_content
	    return get_html(file_path)
	def get_onclicks(file_path):
	    """Count the HTML elements of the email that carry an ``onclick`` attribute.

	    Returns ``None`` when the email has no HTML content, otherwise the number
	    of matching elements.
	    """
	    content = get_html_general(file_path)
	    if content == "":
	        return None
	    soup = BeautifulSoup(content, 'html.parser')
	    # The parser stores attribute names in lower case, so the filter must use
	    # 'onclick'; the mixed-case 'onClick' key never matched anything.
	    elements = soup.find_all(attrs={'onclick': True})
	    return len(elements)
	def check_popWindow(file_path):
	    """Tell whether any ``<script>`` in the email's HTML mentions ``window.open``.

	    Returns ``None`` when the email has no HTML content, ``True`` when at
	    least one script's text contains ``window.open`` and ``False`` otherwise.
	    """
	    content = get_html_general(file_path)
	    if content == "":
	        return None
	    soup = BeautifulSoup(content, 'html.parser')

	    # A script without a single text child has `.string is None`.  Treat it
	    # as empty instead of letting the resulting TypeError turn the verdict
	    # for the whole document into False.
	    return any(
	        'window.open' in (script.string or "")
	        for script in soup.find_all('script')
	    )

	def check_spf(file_path):
	    """Encode the result of the email's ``Received-SPF`` header as an integer.

	    Returns 1 for ``pass``, 2 for ``neutral``, 3 for ``softfail`` and 0 for
	    any other result, an empty header, or a missing header.  The result is
	    the first whitespace-delimited word of the header, compared
	    case-insensitively.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    received_spf_header = message.get('Received-SPF')
	    if received_spf_header is None:
	        return 0
	    # str(): a header containing raw 8-bit bytes is returned as an
	    # email.header.Header object, which has no .split().
	    tokens = str(received_spf_header).split()
	    if not tokens:
	        return 0
	    spf_codes = {'pass': 1, 'neutral': 2, 'softfail': 3}
	    return spf_codes.get(tokens[0].lower(), 0)
	def check_dkim(file_path):
	    """Return 1 if the ``Authentication-Results`` header reports ``dkim=pass``.

	    Only the first ``Authentication-Results`` header is inspected.  Returns 0
	    when the header is missing or does not contain a ``dkim=pass`` token.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    auth = message.get('Authentication-Results')
	    if auth is None:
	        return 0
	    # str(): a header with raw 8-bit bytes arrives as a Header object.
	    # Tokens are lower-cased and stripped of the ';' separator so that
	    # "DKIM=Pass" and "dkim=pass;" are recognised as well.
	    tokens = {token.strip(';') for token in str(auth).lower().split()}
	    return 1 if 'dkim=pass' in tokens else 0
	def check_dmarc(file_path):
	    """Return 1 if the ``Authentication-Results`` header reports ``dmarc=pass``.

	    Only the first ``Authentication-Results`` header is inspected.  Returns 0
	    when the header is missing or does not contain a ``dmarc=pass`` token.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    auth = message.get('Authentication-Results')
	    if auth is None:
	        return 0
	    # str(): a header with raw 8-bit bytes arrives as a Header object.
	    # Tokens are lower-cased and stripped of the ';' separator so that
	    # "DMARC=Pass" and "dmarc=pass;" are recognised as well.
	    tokens = {token.strip(';') for token in str(auth).lower().split()}
	    return 1 if 'dmarc=pass' in tokens else 0
	def check_deliver_receiver(filepath):
	    """Return 1 if the ``Delivered-To`` and ``To`` headers are identical, else 0.

	    The two header values are compared as raw strings.  A message lacking
	    either header returns 0.
	    """
	    with open(filepath, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    deliver = message.get('Delivered-To')
	    receiver = message.get('To')
	    # Two absent headers must not count as a match (None == None is True).
	    if deliver is None or receiver is None:
	        return 0
	    # NOTE(review): display-name forms such as "Bob <bob@x.org>" are not
	    # normalised, so they never equal a bare address in Delivered-To.
	    return 1 if str(deliver) == str(receiver) else 0
	def check_encript(filepath):
	    """Return 1 if any ``Received`` header contains the word ``version``.

	    Returns 0 when no ``Received`` header contains that substring or when the
	    message has no ``Received`` headers at all.
	    """
	    with open(filepath, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    # Default to an empty list so a message without Received headers needs no
	    # exception handling.
	    received_headers = message.get_all('Received', [])
	    version_string = 'version'
	    # str(): a header with raw 8-bit bytes arrives as a Header object, which
	    # does not support `in`; previously that aborted the scan of the
	    # remaining headers.
	    if any(version_string in str(header) for header in received_headers):
	        return 1
	    return 0
	def get_tags_from_html(html_content):
	    """Return the name of every tag found in *html_content*.

	    The list follows the order in which ``find_all()`` yields the tags and
	    keeps repeated names.
	    """
	    parsed = BeautifulSoup(html_content, 'html.parser')
	    return [element.name for element in parsed.find_all()]
	import ipaddress
	from urllib.parse import urlparse
	import urllib.request
	from bs4 import BeautifulSoup
	import re
	import email

	#collect the http(s) links of the anchors in html content
	def get_urls_from_html(html_content):
	    """Return the ``href`` of every ``<a>`` tag that starts with http(s)://.

	    Anchors without an ``href``, with an empty one, or with one that does not
	    begin with ``http://`` or ``https://`` are skipped.
	    """
	    parsed = BeautifulSoup(html_content, 'html.parser')
	    candidates = (anchor.get('href') for anchor in parsed.find_all('a'))
	    return [link for link in candidates if link and re.match('^https?://', link)]
	def get_text(file_path):
	    """Return the email's textual body as one line.

	    All ``text/plain`` parts of the message stored at *file_path* are decoded
	    and concatenated, and newlines are replaced by spaces.  When the message
	    has no plain-text content, the text extracted from its HTML (via
	    ``get_html_general`` and ``get_text_from_html``) is returned instead.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    chunks = []
	    for part in message.walk():
	        if part.get_content_type() != 'text/plain':
	            continue
	        payload = part.get_payload(decode=True) or b""
	        # Honour the charset declared by the part; iso-8859-1 stays the
	        # fallback because it can decode any byte sequence.
	        charset = part.get_content_charset() or 'iso-8859-1'
	        try:
	            chunks.append(payload.decode(charset, errors='replace'))
	        except LookupError:
	            # Charset name unknown to Python.
	            chunks.append(payload.decode('iso-8859-1'))

	    text_content = "".join(chunks)
	    if text_content == "":
	        return get_text_from_html(get_html_general(file_path))
	    # Replace (rather than delete) newlines so the last word of one line is
	    # not glued to the first word of the next.
	    return text_content.replace("\n", " ")
	def get_num_words(file_path):
	    """Return the number of whitespace-separated words in the email body.

	    The text from ``get_text`` is used when it is non-empty; otherwise the
	    text extracted from the email's HTML is counted.  Returns 0 when neither
	    source yields content.
	    """
	    # Each helper re-reads and re-parses the file, so call it only once.
	    text = get_text(file_path)
	    if text != "":
	        return len(text.split())
	    html = get_html_general(file_path)
	    if html != "":
	        return len(get_text_from_html(html).split())
	    return 0

	# get how many characters are in the email text or html (spaces excluded)
	def get_num_chars(file_path):
	    """Return the number of characters in the email body, not counting spaces.

	    The text from ``get_text`` is used when it is non-empty; otherwise the
	    text extracted from the email's HTML is measured.  Returns 0 when neither
	    source yields content.
	    """
	    # Each helper re-reads and re-parses the file, so call it only once.
	    text = get_text(file_path)
	    if text != "":
	        return len(text.replace(" ", ""))
	    html = get_html_general(file_path)
	    if html != "":
	        return len(get_text_from_html(html).replace(" ", ""))
	    return 0

	#calculate the body richness by dividing the number of words by the number of characters
	def get_body_richness(filepath):
	    """Return words / characters for the email body, or 0 for an empty body."""
	    # Evaluate the character count once; every call re-parses the file.
	    num_chars = get_num_chars(filepath)
	    if num_chars == 0:
	        return 0
	    return get_num_words(filepath) / num_chars

	#get how many of the function words occur in the content
	def get_num_FunctionWords(file_path):
	    """Count how many distinct function words appear in the email body.

	    A word counts once if it occurs as a whitespace-delimited, case-sensitive
	    token of the body text (plain text first, HTML text as fallback).
	    Returns ``None`` when the email has neither text nor HTML content.
	    """
	    function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
	    # Each helper re-parses the file, so call it once and keep the result.
	    text = get_text(file_path)
	    if text != "":
	        tokens = set(text.split())
	    else:
	        html = get_html_general(file_path)
	        if html == "":
	            return None
	        tokens = set(get_text_from_html(html).split())
	    # Set membership instead of scanning the token list for every word.
	    return sum(1 for word in function_words if word in tokens)


	def get_email_html(file_path):
	    """Return the concatenated ``text/html`` parts of an email file.

	    The file at *file_path* is parsed as an email message and every part
	    whose content type is ``text/html`` is decoded and appended.  An empty
	    string is returned when the message has no such part.
	    """
	    with open(file_path, 'rb') as file:
	        content = email.message_from_bytes(file.read())

	    pieces = []
	    for part in content.walk():
	        if part.get_content_type() != 'text/html':
	            continue
	        payload = part.get_payload(decode=True) or b""
	        # Honour the charset declared by the part; iso-8859-1 stays the
	        # fallback because it can decode any byte sequence.
	        charset = part.get_content_charset() or 'iso-8859-1'
	        try:
	            pieces.append(payload.decode(charset, errors='replace'))
	        except LookupError:
	            # Charset name unknown to Python.
	            pieces.append(payload.decode('iso-8859-1'))
	    # NOTE: the previous `html_content.replace("\n", "")` discarded its
	    # result (str is immutable), so newlines were always kept; they still are.
	    return "".join(pieces)

	#number of whitespace-separated words in the subject line
	def get_num_sbj(file_path):
	    """Return how many words the email's subject contains."""
	    subject_words = get_subject(file_path).split()
	    return len(subject_words)
	def get_subject(file_path):
	    """Return the email's subject with whitespace runs collapsed to one space.

	    The first ``Subject`` header is used.  An empty string is returned when
	    the message has no such header.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    # Message.get() matches header names case-insensitively, so "subject:"
	    # and "SUBJECT:" are found too; like the old loop it returns the first
	    # occurrence.
	    subject = message.get('Subject', "")
	    return re.sub(r"\s+", " ", str(subject))


	def get_sender(file_path):
	    """Return the value of the email's first ``From`` header.

	    Returns ``None`` when the header is missing or empty.
	    """
	    with open(file_path, 'rb') as file:
	        message = email.message_from_bytes(file.read())

	    # Message.get() matches header names case-insensitively, so "from:" and
	    # "FROM:" are found too; like the old loop it returns the first
	    # occurrence.
	    sender = message.get('From', "")
	    if sender == "":
	        return None
	    return sender

	#length of the subject line in characters
	def get_num_sbjChar(file_path):
	    """Return the number of characters in the email's subject."""
	    subject = get_subject(file_path)
	    return len(subject)

	#calculate the subject richness by dividing the number of words by the number of characters
	def get_sbj_richness(file_path):
	    """Return words / characters for the subject, or 0 for an empty subject."""
	    # Read the subject once; the helper-based version parsed the file three
	    # times to produce the same two numbers.
	    subject = get_subject(file_path)
	    num_chars = len(subject)
	    if num_chars == 0:
	        return 0
	    return len(subject.split()) / num_chars

	# get how many urls use an ip address as their host
	def get_num_urls_ip(file_path):
	    """Count the links in the email's HTML whose host is a literal IP address.

	    Returns 0 when the email has no HTML content.  URLs that cannot be parsed
	    or whose host is not an IP address are not counted.
	    """
	    from urllib.parse import urlparse

	    content = get_html_general(file_path)
	    if content == "":
	        return 0
	    num_ip = 0
	    for url in get_urls_from_html(content):
	        try:
	            # urlparse itself raises ValueError on malformed hosts (e.g. an
	            # unbalanced '['), so it has to sit inside the try as well.
	            hostname = urlparse(url).hostname
	            if hostname is None:
	                continue
	            ipaddress.ip_address(hostname)
	        except ValueError:
	            continue
	        num_ip += 1
	    return num_ip

	# total number of http(s) links in the html content
	def get_num_urls(file_path):
	    """Return the number of links in the email's HTML, or ``None`` if there are none."""
	    found = get_urls_from_html(get_html_general(file_path))
	    return len(found) if found else None

	# count the links in the html that wrap an image
	def get_num_image_urls(file_path):
	    """Return how many ``<a href=...>`` tags in the email's HTML contain an ``<img>``."""
	    parsed = BeautifulSoup(get_html_general(file_path), 'html.parser')

	    # Every anchor that carries an href, then keep those with an <img> inside.
	    anchors = parsed.find_all('a', href=True, recursive=True, limit=None, string=None)
	    return sum(1 for anchor in anchors if anchor.find('img'))

	# get the number of distinct domains among the urls
	def get_num_domain_urls(file_path):
	    """Return how many distinct hosts the links in the email's HTML point to.

	    The host is the network-location part of each URL (including any port or
	    user info).  URLs that cannot be parsed are ignored.
	    """
	    domains = set()
	    for url in get_urls_from_html(get_html_general(file_path)):
	        try:
	            # The former regex required a '/' after the host, so links such
	            # as "http://example.com" were never counted.
	            netloc = urlparse(url).netloc
	        except ValueError:
	            continue
	        if netloc:
	            domains.add(netloc)
	    return len(domains)


	#get how many urls contain port info
	def get_num_url_ports(file_path):
	    """Count the links in the email's HTML that specify an explicit port.

	    URLs whose port cannot be parsed (non-numeric or out of range) are
	    skipped rather than aborting the whole count.
	    """
	    count = 0
	    for url in get_urls_from_html(get_html_general(file_path)):
	        try:
	            # Reading .port validates it and raises ValueError when invalid.
	            port = urlparse(url).port
	        except ValueError:
	            continue
	        # `is not None` so that an explicit ":0" still counts as port info.
	        if port is not None:
	            count += 1
	    return count


	#get how many characters in sender
	def get_chars_sender(file_path):
	    """Return the length of the email's ``From`` value, or 0 when there is none."""
	    sender = get_sender(file_path)
	    # get_sender returns None for a missing sender; str(None) would have
	    # been counted as the four characters of "None".
	    if sender is None:
	        return 0
	    return len(str(sender))