import email
import ipaddress
import re
import urllib.request
from urllib.parse import urlparse

from bs4 import BeautifulSoup
def get_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # extract all the text and collapse runs of whitespace
    all_text = soup.get_text()
    all_text = re.sub(r"\s+", " ", all_text)
    return all_text
# get the text/plain content of an email, falling back to its html body
def get_text(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
    # fall back to the html body when there is no text/plain part
    if text_content == "":
        return get_text_from_html(get_html_general(file_path))
    return text_content.replace("\n", "")
# get the text/html content of an email, or "" when none exists
def get_email_html(file_path):
    with open(file_path, 'rb') as file:
        content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    # str.replace returns a new string, so the result must be reassigned
    html_content = html_content.replace("\n", "")
    return html_content
# get html by scanning the raw file for <html> ... </html> tags
def get_html(file_path):
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        html_flag = False
        html_content = ""
        for line in file:
            for word in line.split():
                if word == "<html>":
                    html_flag = True
                if html_flag:
                    # rejoin with a space so attributes and word boundaries survive
                    html_content += word + " "
                if word == "</html>":
                    html_flag = False
        return html_content
# prefer the MIME-declared html part; fall back to the raw tag scan
def get_html_general(file_path):
    html_content = get_email_html(file_path)
    if html_content != "":
        return html_content
    return get_html(file_path)
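
# Minimal usage sketch of the fallback chain above; the function name and the
# "sample.eml" path are illustrative assumptions, not part of the feature set.
def _demo_extract_body(file_path="sample.eml"):
    html = get_html_general(file_path)  # MIME text/html part first, raw <html> scan second
    if html != "":
        return get_text_from_html(html)
    return get_text(file_path)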
# count the elements that carry an onclick attribute
def get_onclicks(file_path):
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # html.parser lowercases attribute names, so match 'onclick', not 'onClick'
    elements = soup.find_all(attrs={'onclick': True})
    return len(elements)
# check whether any <script> in the body opens a pop-up window
def check_popWindow(file_path):
    content = get_html_general(file_path)
    if content == "":
        return None
    soup = BeautifulSoup(content, 'html.parser')
    # guard against script tags with no string content to avoid a TypeError
    scripts = soup.find_all('script', string=lambda s: s is not None and 'window.open' in s)
    return bool(scripts)
# encode the Received-SPF result: 1=pass, 2=neutral, 3=softfail, 0=missing/other
def check_spf(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    if received_spf_header is None:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    elif spf_result == 'neutral':
        return 2
    elif spf_result == 'softfail':
        return 3
    return 0
# return 1 when the Authentication-Results header reports dkim=pass
def check_dkim(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    if 'dkim=pass' in auth.split():
        return 1
    return 0
# return 1 when the Authentication-Results header reports dmarc=pass
def check_dmarc(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    if 'dmarc=pass' in auth.split():
        return 1
    return 0
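
# Sketch of how the three authentication checks could feed one sub-vector;
# "_demo_auth_features" is an illustrative name, not used elsewhere here.
def _demo_auth_features(file_path):
    # e.g. [1, 1, 0] for an email that passes SPF and DKIM but fails DMARC
    return [check_spf(file_path), check_dkim(file_path), check_dmarc(file_path)]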
# return 1 when the Delivered-To header matches the To header
def check_deliver_receiver(filepath):
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    deliver = message.get('Delivered-To')
    receiver = message.get('To')
    return 1 if deliver == receiver else 0
# return 1 when any Received header mentions a TLS 'version' string
def check_encript(filepath):
    with open(filepath, 'rb') as file:
        message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    if received_headers is None:
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0
# list the names of all tags in the html content
def get_tags_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    return [tag.name for tag in soup.find_all()]
# get all http(s) urls from anchor tags in the html content
def get_urls_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and re.match(r'^https?://', href):
            urls.append(href)
    return urls
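
# Example: for '<a href="http://example.com/a">x</a> <a href="mailto:a@b.c">y</a>'
# the function returns ['http://example.com/a']; the mailto: link fails the
# ^https?:// match and is skipped.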
# count the words in the email body (text part preferred, then html)
def get_num_words(file_path):
    text = get_text(file_path)
    if text != "":
        return len(text.split())
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0
# get how many characters are in the email text or html, spaces excluded
def get_num_chars(file_path):
    text = get_text(file_path)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file_path)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0
# calculate the body richness by dividing the number of words by the number of characters
def get_body_richness(filepath):
    num_chars = get_num_chars(filepath)
    if num_chars == 0:
        return 0
    return get_num_words(filepath) / num_chars
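
# Worked example (hypothetical body, for illustration): "Verify your account
# now" has 4 words and 20 non-space characters, so its richness is
# 4 / 20 = 0.2. English words average roughly five letters, so ordinary prose
# sits near 0.2, while long unbroken strings such as urls pull the ratio down.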
# count how many of a fixed list of phishing-related function words appear in the content
def get_num_FunctionWords(file_path):
    function_words = ["account", "access", "bank", "credit", "click", "identity",
                      "inconvenience", "information", "limited", "log", "minutes",
                      "password", "recently", "risk", "social", "security",
                      "service", "suspended"]
    if get_text(file_path) != "":
        content = get_text(file_path).split()
    elif get_html_general(file_path) != "":
        content = get_text_from_html(get_html_general(file_path)).split()
    else:
        return None
    count = 0
    for w in function_words:
        if w in content:
            count += 1
    return count
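
# Note the granularity: each listed word contributes at most 1, so a body that
# repeats "password" five times scores the same as one mention. For example,
# "click here to reset your password" yields 2 ("click" and "password").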
# get how many words are in the subject
def get_num_sbj(file_path):
    return len(get_subject(file_path).split())
# get the Subject header, with whitespace collapsed
def get_subject(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    subject = ""
    for name, value in message.items():
        if name == "Subject":
            subject = value
            break
    subject = re.sub(r"\s+", " ", str(subject))
    return subject
# get the From header, or None when it is missing
def get_sender(file_path):
    with open(file_path, 'rb') as file:
        message = email.message_from_bytes(file.read())
    sender = ""
    for name, value in message.items():
        if name == "From":
            sender = value
            break
    if sender == "":
        return None
    return sender
# get how many characters are in the subject
def get_num_sbjChar(file_path):
    return len(get_subject(file_path))
# calculate the subject richness by dividing words by characters
def get_sbj_richness(file_path):
    num_chars = get_num_sbjChar(file_path)
    if num_chars == 0:
        return 0
    return get_num_sbj(file_path) / num_chars
# get how many urls use a raw ip address as their hostname
def get_num_urls_ip(file_path):
    content = get_html_general(file_path)
    if content == "":
        return 0
    urls = get_urls_from_html(content)
    num_ip = 0
    for url in urls:
        hostname = urlparse(url).hostname
        try:
            # ip_address raises ValueError for hostnames that are not IP literals
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            pass
    return num_ip
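
# For example, "http://192.168.10.5/login" is counted because its hostname
# parses as an IP literal, while "http://example.com/login" raises ValueError
# and is skipped.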
# return the total number of urls in the html content, or None when there are none
def get_num_urls(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    if urls == []:
        return None
    return len(urls)
# get how many clickable image links (<a> tags wrapping an <img>) are in the html
def get_num_image_urls(file_path):
    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
    image_links = soup.find_all('a', href=True)
    image_links_with_img = [link for link in image_links if link.find('img')]
    return len(image_links_with_img)
# count the distinct domains among the urls
def get_num_domain_urls(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    domains = set()
    for url in urls:
        match = re.search(r'https?://([^/]+)/', url)
        if match:
            domains.add(match.group(1))
    return len(domains)
# get how many urls contain explicit port info
def get_num_url_ports(file_path):
    urls = get_urls_from_html(get_html_general(file_path))
    count = 0
    for url in urls:
        # urlparse exposes an explicit port, if present, as an int
        if urlparse(url).port:
            count += 1
    return count
# get how many characters are in the sender address
def get_chars_sender(file_path):
    sender = get_sender(file_path)
    return len(str(sender))
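
# End-to-end sketch assembling the extractors above into one feature vector;
# the function name, feature order, and "sample.eml" default are illustrative
# assumptions rather than a fixed interface.
def _demo_feature_vector(file_path="sample.eml"):
    return [
        get_num_words(file_path),           # body word count
        get_num_chars(file_path),           # body character count
        get_body_richness(file_path),       # words per character
        get_num_FunctionWords(file_path),   # distinct phishing function words
        get_sbj_richness(file_path),        # subject words per character
        check_spf(file_path),               # SPF result code
        check_dkim(file_path),              # DKIM pass flag
        check_dmarc(file_path),             # DMARC pass flag
        check_deliver_receiver(file_path),  # Delivered-To == To
        check_encript(file_path),           # TLS hint in Received headers
        get_num_urls(file_path),            # total urls (None when absent)
        get_num_urls_ip(file_path),         # urls with IP hostnames
        get_num_url_ports(file_path),       # urls with explicit ports
    ]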