import email
import re

from bs4 import BeautifulSoup


def get_text_from_html(html_content):
    """Return the text of an HTML string with whitespace collapsed.

    Args:
        html_content: HTML markup as a string.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``, with every run
        of whitespace replaced by a single space.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    return re.sub(r"\s+", " ", soup.get_text())


def _decode_parts(file_path, content_type):
    """Concatenate the decoded payloads of every part of one content type.

    The file is read as bytes and parsed as an email message. Each matching
    part is transfer-decoded and then decoded as ISO-8859-1.

    Args:
        file_path: Path of the file to parse as an email message.
        content_type: Content type to select, e.g. ``'text/plain'``.

    Returns:
        The matching payloads joined in walk order, or ``""`` if none match.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())

    chunks = []
    for part in message.walk():
        if part.get_content_type() != content_type:
            continue
        payload = part.get_payload(decode=True)
        # get_payload(decode=True) yields None for multipart containers.
        if payload is not None:
            chunks.append(payload.decode('iso-8859-1'))
    return "".join(chunks)


def get_text(file_path):
    """Return the text content of an email file.

    All ``text/plain`` parts are concatenated and newlines are removed. When
    the message has no plain-text content, the text is extracted from the
    HTML found by ``get_html_general`` instead.

    Args:
        file_path: Path of the email file.

    Returns:
        The extracted text as a string (possibly empty).
    """
    text_content = _decode_parts(file_path, 'text/plain')
    if text_content:
        return text_content.replace("\n", "")
    # No plain-text body: fall back to the text of the HTML body.
    return get_text_from_html(get_html_general(file_path))


def get_email_html(file_path):
    """Return the concatenated ``text/html`` parts of an email file.

    Args:
        file_path: Path of the email file.

    Returns:
        The HTML markup with its newlines intact, or ``""`` when the message
        has no ``text/html`` part.
    """
    return _decode_parts(file_path, 'text/html')


def get_html(file_path):
    """Return the HTML found in a file by scanning for ``<html>`` tags.

    The file is read as ISO-8859-1 and split into whitespace-separated
    tokens. Tokens from one containing ``<html`` up to and including one
    containing ``</html>`` are collected (case-insensitively); several such
    sections are all kept, and an unclosed section runs to the end of file.

    Args:
        file_path: Path of the file to scan.

    Returns:
        The collected tokens joined by single spaces, or ``""`` when no
        opening tag is found.
    """
    tokens = []
    inside_html = False
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file:
            # split() never yields empty strings, so the tags themselves
            # must be matched to toggle the flag.
            for word in line.split():
                lowered = word.lower()
                if not inside_html and "<html" in lowered:
                    inside_html = True
                if inside_html:
                    tokens.append(word)
                    if "</html>" in lowered:
                        inside_html = False
    # Joining with spaces keeps tag names and attributes separated.
    return " ".join(tokens)


def get_html_general(file_path):
    """Return the HTML of a file, trying MIME parts before a raw tag scan.

    Args:
        file_path: Path of the file.

    Returns:
        The result of ``get_email_html`` when it is non-empty, otherwise the
        result of ``get_html`` (which may itself be ``""``).
    """
    html_content = get_email_html(file_path)  # parse the message only once
    if html_content:
        return html_content
    return get_html(file_path)


def get_onclicks(file_path):
    """Count the elements that carry an ``onclick`` attribute.

    Args:
        file_path: Path of the file.

    Returns:
        The number of matching elements, or ``None`` when the file contains
        no HTML.
    """
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # The HTML parsers lower-case attribute names, so 'onClick' in the
    # markup is stored as 'onclick'; searching for 'onClick' never matches.
    return len(soup.find_all(attrs={'onclick': True}))


def check_popWindow(file_path):
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # Check if any