def get_text_from_html(html_content):
    """Return the text of an HTML document with whitespace runs collapsed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, the
    text of the whole tree is taken via ``get_text()``, and every run of
    whitespace (spaces, tabs, newlines) is replaced by one space.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The extracted text as a single string. Leading/trailing whitespace is
        collapsed to a single space, not stripped.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    return re.sub(r"\s+", " ", parsed.get_text())
# Get the text/plain content of an email file
def get_text(file_path):
    """Return the plain-text body of the email stored at ``file_path``.

    Every ``text/plain`` part of the message is decoded and concatenated.
    When the message yields no plain text at all, the text is instead
    extracted from the message's HTML via ``get_html_general`` and
    ``get_text_from_html``.

    Args:
        file_path: Path to a raw email file; it is read as bytes.

    Returns:
        The concatenated plain text with every ``"\\n"`` removed, or the
        HTML-derived text when no plain text was found.
    """
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())

    pieces = []
    for part in message.walk():
        if part.get_content_type() != 'text/plain':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing decodable in this part; skip it instead of crashing.
            continue
        # Honour the charset declared by the part; iso-8859-1 stays the
        # fallback because it can decode any byte sequence.
        charset = part.get_content_charset() or 'iso-8859-1'
        try:
            pieces.append(payload.decode(charset))
        except (LookupError, ValueError):
            # Unknown charset name or bytes that do not match the label.
            pieces.append(payload.decode('iso-8859-1'))

    text_content = "".join(pieces)
    if text_content == "":
        return get_text_from_html(get_html_general(file_path))
    # NOTE: line breaks are removed, not replaced by a space, so words on
    # adjacent lines are joined together.
    return text_content.replace("\n", "")
from bs4 import BeautifulSoup
import email
def get_email_html(file_path):
    """Return the concatenated ``text/html`` parts of the email at ``file_path``.

    Args:
        file_path: Path to a raw email file; it is read as bytes.

    Returns:
        The decoded HTML of every ``text/html`` part joined in walk order, or
        ``""`` when the message has no such part. Line breaks are kept as they
        appear in the parts.
    """
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())

    fragments = []
    for part in content.walk():
        if part.get_content_type() != 'text/html':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing decodable in this part; skip it instead of crashing.
            continue
        # Honour the charset declared by the part; iso-8859-1 stays the
        # fallback because it can decode any byte sequence.
        charset = part.get_content_charset() or 'iso-8859-1'
        try:
            fragments.append(payload.decode(charset))
        except (LookupError, ValueError):
            # Unknown charset name or bytes that do not match the label.
            fragments.append(payload.decode('iso-8859-1'))

    # Newlines are deliberately left in place: a line break can be the only
    # whitespace between a tag name and its attributes, so deleting it would
    # corrupt the markup.
    return "".join(fragments)
# Get HTML by searching the raw file for the html tag
def get_html(file_path):
    """Extract the ``<html> ... </html>`` section of a raw file by tag search.

    The file is read as iso-8859-1 text and split into whitespace-separated
    tokens. Collection starts at the first token containing an opening
    ``<html`` tag and stops after the token containing ``</html``; matching is
    case-insensitive. If the file holds several such sections they are all
    collected, in order.

    Args:
        file_path: Path to the file to scan.

    Returns:
        The collected tokens joined by single spaces, or ``""`` when no
        ``<html`` tag is found.
    """
    collected = []
    inside_html = False
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        for line in file:
            for word in line.split():
                lowered = word.lower()
                # The start/end markers must be the tags themselves:
                # str.split() never yields an empty token, so comparing a
                # token against "" can never toggle the flag.
                if "<html" in lowered:
                    inside_html = True
                if inside_html:
                    collected.append(word)
                if "</html" in lowered:
                    inside_html = False
    # Join with a space so that tokens such as `<a` and `href=...` are not
    # fused into invalid markup.
    return " ".join(collected)
def get_html_general(file_path):
    """Return the HTML found in ``file_path``, trying MIME parts before tag search.

    ``get_email_html`` is tried first; if it returns an empty string the
    result of ``get_html`` is returned instead.

    Args:
        file_path: Path to the email/HTML file.

    Returns:
        The HTML string from whichever extractor produced one; ``""`` if
        neither did.
    """
    # Call the MIME-based extractor once and reuse its result, so the file is
    # not read and parsed a second time just to return the same value.
    email_html = get_email_html(file_path)
    if email_html != "":
        return email_html
    return get_html(file_path)
def get_onclicks(file_path):
    """Count the elements in a file's HTML that carry an ``onclick`` attribute.

    Args:
        file_path: Path to the email/HTML file; its HTML is obtained through
            ``get_html_general``.

    Returns:
        The number of matching elements, or ``None`` when no HTML content was
        found in the file.
    """
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # html.parser lower-cases attribute names while parsing, so the lookup
    # key must be lower-case too; a mixed-case key such as 'onClick' never
    # matches and would always give a count of 0.
    return len(soup.find_all(attrs={'onclick': True}))
def check_popWindow(file_path):
content = get_html_general(file_path)
if content == "": return None
soup = BeautifulSoup(content, 'html.parser')
# Check if any