import ipaddress
from urllib.parse import urlparse
import urllib.request
from bs4 import BeautifulSoup
import re
import email
def get_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # extract all the text and collapse runs of whitespace
    all_text = soup.get_text()
    all_text = re.sub(r"\s+", " ", all_text)
    return all_text
# get the text/plain content from an email, falling back to text
# extracted from the html part when no plain-text part exists
def get_text(file):
    file.seek(0)  # rewind so repeated reads of the same file object work
    message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    # the fallback must be checked before returning, otherwise it is unreachable
    if text_content == "":
        return get_text_from_html(get_html_general(file))
    return text_content.replace("\n", "")
# get the text/html part of an email, or "" if there is none
def get_email_html(file):
    file.seek(0)
    content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    # str.replace returns a new string, so the result must be reassigned
    html_content = html_content.replace("\n", "")
    return html_content
# get html by scanning the raw message for an <html>...</html> span
def get_html(file):
    file.seek(0)
    html_flag = False
    html_content = ""
    # file.read() yields one string; iterate over its lines, not its characters
    for line in file.read().decode('iso-8859-1').splitlines():
        for word in line.split():
            if word == "<html>":
                html_flag = True
            if html_flag:
                html_content += word + " "  # keep words separated
            if word == "</html>":
                html_flag = False
    return html_content
# prefer the declared text/html part; fall back to tag scanning
def get_html_general(file):
    html_content = get_email_html(file)
    if html_content != "":
        return html_content
    return get_html(file)
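# A minimal usage sketch (the "sample.eml" path is illustrative, not part of
# this module). The extractors take a binary file object and rewind it
# themselves with seek(0), so a single handle can feed several calls:
#
#   with open("sample.eml", "rb") as f:
#       html = get_html_general(f)
#       text = get_text(f)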
def get_onclicks(file):
    content = get_html_general(file)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # html.parser lowercases attribute names, so match 'onclick', not 'onClick'
    elements = soup.find_all(attrs={'onclick': True})
    # count the number of elements with an onclick attribute
    return len(elements)
# check whether any <script> in the body opens a pop-up window
def check_popWindow(file):
    content = get_html_general(file)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # guard against tags whose string is None, which would raise TypeError
    scripts = soup.find_all('script',
                            string=lambda t: t is not None and 'window.open' in t)
    if scripts:
        return True
    return False
def check_spf(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    if not received_spf_header:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    elif spf_result == 'neutral':
        return 2
    elif spf_result == 'softfail':
        return 3
    else:
        return 0
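# Illustrative example (hypothetical header, not from a real message):
#   "Received-SPF: pass (example.com: domain of a@example.com designates ...)"
# split()[0].lower() is "pass", so check_spf returns 1; "neutral" maps to 2,
# "softfail" to 3, and anything else (including "fail") to 0.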
def check_dkim(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    auth_result = auth.split()
    if 'dkim=pass' in auth_result:
        return 1
    return 0
def check_dmarc(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    auth_result = auth.split()
    if 'dmarc=pass' in auth_result:
        return 1
    return 0
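# Illustrative example (hypothetical header): given
#   "Authentication-Results: mx.example.com; dkim=pass dmarc=pass"
# the whitespace-split tokens include "dkim=pass" and "dmarc=pass", so both
# check_dkim and check_dmarc return 1. Note that a token written as
# "dkim=pass;" (trailing semicolon) would not match this exact-token test.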
# check whether the Delivered-To header matches the To header
def check_deliver_receiver(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    deliver = message.get('Delivered-To')
    receiver = message.get('To')
    if deliver == receiver:
        return 1
    return 0
# check whether any Received header mentions a protocol version string,
# which suggests the hop used an encrypted (TLS) connection
def check_encript(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    if received_headers is None:  # get_all returns None when the header is absent
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0
def get_tags_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tag_list = []
    for tag in soup.find_all():
        tag_list.append(tag.name)
    return tag_list
# get urls in html content
def get_urls_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    # collect every anchor href that looks like an http(s) url
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and re.match(r'^https?://', href):
            urls.append(href)
    return urls
# get how many words are in the email text or html
def get_num_words(file):
    text = get_text(file)
    if text != "":
        return len(text.split())
    html = get_html_general(file)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
# get how many characters are in the email text or html
def get_num_chars(file):
    text = get_text(file)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
# calculate the body richness by dividing the number of words by the number of characters
def get_body_richness(file):
    if get_num_chars(file) == 0:
        return 0
    return get_num_words(file) / get_num_chars(file)
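# Worked example (made-up numbers): a body of 20 words and 100 non-space
# characters gives a richness of 20 / 100 = 0.2.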
# get how many of the phishing-related function words appear in the content
def get_num_FunctionWords(file):
    function_words = ["account", "access", "bank", "credit", "click", "identity",
                      "inconvenience", "information", "limited", "log", "minutes",
                      "password", "recently", "risk", "social", "security",
                      "service", "suspended"]
    count = 0
    if get_text(file) != "":
        content = get_text(file).split()
    elif get_html_general(file) != "":
        content = get_text_from_html(get_html_general(file)).split()
    else:
        return None
    # each function word is counted once if it appears as a whole token
    for w in function_words:
        if w in content:
            count += 1
    return count
# get how many words are in the subject
def get_num_sbj(file):
    return len(get_subject(file).split())
def get_subject(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    subject = message.get('Subject', "")
    subject = re.sub(r"\s+", " ", str(subject))
    return subject
def get_sender(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    sender = message.get('From')
    if not sender:
        return None
    return sender
# get how many characters are in the subject
def get_num_sbjChar(file):
    return len(get_subject(file))
# calculate the subject richness by dividing the number of words by the number of characters
def get_sbj_richness(file):
    if get_num_sbjChar(file) == 0:
        return 0
    return get_num_sbj(file) / get_num_sbjChar(file)
# get how many urls use a raw ip address as the host
def get_num_urls_ip(file):
    content = get_html_general(file)
    if content == "":
        return 0
    urls = get_urls_from_html(content)
    num_ip = 0
    for url in urls:
        hostname = urlparse(url).hostname
        try:
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            pass  # hostname is a regular domain name, not an ip address
    return num_ip
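# Illustrative example (hypothetical urls): "http://192.168.0.1/login" has
# hostname "192.168.0.1", which ip_address() accepts, so it is counted;
# "http://example.com/login" raises ValueError and is skipped.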
# return the total number of urls in the html content
def get_num_urls(file):
    urls = get_urls_from_html(get_html_general(file))
    if urls == []:
        return None
    return len(urls)
# get how many clickable image links are in the html
def get_num_image_urls(file):
    soup = BeautifulSoup(get_html_general(file), 'html.parser')
    # find all <a> tags that contain an <img> tag
    image_links = soup.find_all('a', href=True)
    image_links_with_img = [link for link in image_links if link.find('img')]
    return len(image_links_with_img)
# count the number of distinct domains across the urls
def get_num_domain_urls(file):
    urls = get_urls_from_html(get_html_general(file))
    domains = set()
    for url in urls:
        # match the host part; a trailing slash is not required
        match = re.search(r'https?://([^/]+)', url)
        if match:
            domains.add(match.group(1))
    return len(domains)
# get how many urls contain explicit port info
def get_num_url_ports(file):
    urls = get_urls_from_html(get_html_general(file))
    count = 0
    for url in urls:
        parsed_url = urlparse(url)
        # urlparse exposes an explicit port, if any, via .port
        if parsed_url.port:
            count += 1
    return count
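# Illustrative example (hypothetical urls): "http://example.com:8080/x"
# parses with .port == 8080 and is counted; "https://example.com/" has no
# explicit port, so .port is None and it is skipped.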
# get how many characters are in the sender
def get_chars_sender(file):
    sender = get_sender(file)
    return len(str(sender))
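# A minimal driver sketch, assuming a local message file "sample.eml"; the
# path and the selection of features printed are illustrative only.
if __name__ == "__main__":
    with open("sample.eml", "rb") as f:
        features = {
            "num_words": get_num_words(f),
            "num_chars": get_num_chars(f),
            "body_richness": get_body_richness(f),
            "num_urls": get_num_urls(f),
            "num_urls_ip": get_num_urls_ip(f),
            "spf": check_spf(f),
            "dkim": check_dkim(f),
            "dmarc": check_dmarc(f),
        }
        print(features)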