import email
import ipaddress
import re
import urllib.request
from urllib.parse import urlparse

from bs4 import BeautifulSoup

# Charset used to decode every payload. iso-8859-1 maps each byte value to a
# code point, so decoding with it never raises.
_CHARSET = 'iso-8859-1'
_WHITESPACE_RE = re.compile(r"\s+")


def _read_bytes(source):
    """Return the raw bytes behind *source*.

    Objects exposing ``read()`` are read once from their current position;
    anything else (typically ``bytes``) is returned unchanged. This lets a
    caller read a stream a single time and pass the bytes on to the other
    helpers in this module.
    """
    if hasattr(source, "read"):
        return source.read()
    return source


def _decode_parts(raw, content_type):
    """Concatenate the decoded payload of every MIME part of *content_type*.

    Args:
        raw: Raw message bytes.
        content_type: Exact content type to select, e.g. ``'text/html'``.

    Returns:
        The selected payloads decoded as iso-8859-1 and joined in walk order;
        ``""`` when no part matches.
    """
    message = email.message_from_bytes(raw)
    return "".join(
        part.get_payload(decode=True).decode(_CHARSET)
        for part in message.walk()
        if part.get_content_type() == content_type
    )


def get_text_from_html(html_content):
    """Return the visible text of *html_content* with whitespace collapsed.

    Args:
        html_content: HTML markup as a string.

    Returns:
        The document text with every run of whitespace replaced by one space.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    return _WHITESPACE_RE.sub(" ", soup.get_text())


def get_text(file):
    """Return the textual content of an e-mail.

    The ``text/plain`` parts are preferred; when the message has none (or they
    are empty) the text is extracted from the message's HTML instead.

    Args:
        file: Binary file-like object (``read()`` returns bytes) or the raw
            message bytes.

    Returns:
        The plain-text parts concatenated with newlines removed, or the
        whitespace-collapsed text of the HTML content as a fallback.
    """
    raw = _read_bytes(file)
    text_content = _decode_parts(raw, 'text/plain')
    if text_content:
        return text_content.replace("\n", "")
    # Pass the bytes already read: the stream itself is exhausted by now, so
    # re-reading it would yield an empty message.
    return get_text_from_html(get_html_general(raw))


def get_email_html(file):
    """Return the concatenated ``text/html`` parts of an e-mail.

    Args:
        file: Binary file-like object (``read()`` returns bytes) or the raw
            message bytes.

    Returns:
        The decoded HTML markup (newlines are kept), or ``""`` when the
        message has no ``text/html`` part.
    """
    return _decode_parts(_read_bytes(file), 'text/html')


def get_html(file):
    """Collect the whitespace-separated words found between two marker words.

    Args:
        file: Raw bytes (or a binary file-like object) decoded as iso-8859-1.

    Returns:
        The collected words joined without separators, or ``""`` when nothing
        was collected.
    """
    text = _read_bytes(file).decode(_CHARSET)
    inside = False
    collected = []
    for line in text.splitlines():
        for word in line.split():
            # NOTE(review): both marker comparisons below test against an
            # empty string, which str.split() never produces, so nothing is
            # ever collected and this function currently always returns "".
            # The start/end markers appear to have been lost (presumably the
            # opening and closing html tags) - confirm and restore them.
            if word == "":
                inside = True
            if inside:
                collected.append(word)
            if word == "":
                inside = False
    return "".join(collected)


def get_html_general(file):
    """Return the HTML of a message, trying MIME parts first.

    The input is read exactly once and the same bytes are handed to both
    extractors, so the result of the MIME lookup is not lost to a second read
    of an exhausted stream.

    Args:
        file: Binary file-like object (``read()`` returns bytes) or the raw
            message bytes.

    Returns:
        The ``text/html`` parts when present, otherwise whatever
        :func:`get_html` extracts (possibly ``""``).
    """
    raw = _read_bytes(file)
    return get_email_html(raw) or get_html(raw)


def get_onclicks(file):
    """Count the elements that carry an ``onclick`` attribute.

    Args:
        file: Binary file-like object (``read()`` returns bytes) or the raw
            message bytes.

    Returns:
        The number of matching elements, or ``None`` when the message
        contains no HTML.
    """
    content = get_html_general(file)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # The HTML parsers lower-case attribute names, so the lookup key must be
    # lower-case as well; a mixed-case key such as 'onClick' never matches.
    return len(soup.find_all(attrs={'onclick': True}))
check_popWindow(file): content = get_html_general(file) if content == "": return None soup = BeautifulSoup(content, 'html.parser') # Check if any