# Feature-extraction helpers for phishing email detection. Every function
# takes a file object opened in binary mode (an .eml message) and rewinds it
# before reading, so one handle can be passed through several extractors.
import ipaddress
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import email

def get_text_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # extract all visible text and collapse runs of whitespace
    all_text = soup.get_text()
    all_text = re.sub(r"\s+", " ", all_text)
    return all_text
# get the text/plain content from an email, falling back to the html text
def get_text(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    text_content = ""
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            text_content += part.get_payload(decode=True).decode('iso-8859-1')
            return text_content.replace("\n", "")
    # no text/plain part was found
    return get_text_from_html(get_html_general(file))

def get_email_html(file):
    file.seek(0)
    content = email.message_from_bytes(file.read())
    html_content = ""
    for part in content.walk():
        if part.get_content_type() == 'text/html':
            html_content += part.get_payload(decode=True).decode('iso-8859-1')
    # str.replace returns a new string; the original discarded this result
    html_content = html_content.replace("\n", "")
    return html_content

#get html by searching for <html> tag in the raw message body
def get_html(file):
    file.seek(0)
    html_flag = False
    html_content = ""
    # iterate line by line (the original iterated over single characters,
    # so split() never produced whole words)
    for line in file.read().decode('iso-8859-1').splitlines():
        for word in line.split():
            if word == "<html>":
                html_flag = True
            if html_flag:
                html_content += word + " "
            if word == "</html>":
                html_flag = False
    return html_content

def get_html_general(file):
    # cache the result rather than calling get_email_html twice; the second
    # call in the original read an already-exhausted file and returned ""
    html_content = get_email_html(file)
    if html_content != "":
        return html_content
    return get_html(file)

def get_onclicks(file):
    content = get_html_general(file)
    if content == "": return None
    soup = BeautifulSoup(content, 'html.parser')
    # html parsers lowercase attribute names, so match 'onclick', not 'onClick'
    elements = soup.find_all(attrs={'onclick': True})
    # Count the number of elements with an onclick attribute
    return len(elements)
def check_popWindow(file):
    content = get_html_general(file)
    if content == "": return None
    soup = BeautifulSoup(content, 'html.parser')
    # Check for a <script> that tries to open a pop-up window; guard against
    # tags whose string is None before testing for 'window.open'
    scripts = soup.find_all('script', string=lambda text: text is not None and 'window.open' in text)
    return bool(scripts)

# map the Received-SPF result to a numeric feature:
# pass=1, neutral=2, softfail=3, anything else or missing=0
def check_spf(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    received_spf_header = message.get('Received-SPF')
    if received_spf_header is None:
        return 0
    spf_result = received_spf_header.split()[0].lower()
    if spf_result == 'pass':
        return 1
    elif spf_result == 'neutral':
        return 2
    elif spf_result == 'softfail':
        return 3
    return 0
def check_dkim(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    # substring match also catches tokens like 'dkim=pass;' that the original
    # whitespace split missed
    return 1 if 'dkim=pass' in auth else 0
def check_dmarc(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    auth = message.get('Authentication-Results')
    if auth is None:
        return 0
    return 1 if 'dmarc=pass' in auth else 0
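
# For reference, a hypothetical Authentication-Results header of the kind the
# two checks above parse (field values are illustrative, not from this code):
#   Authentication-Results: mx.example.com;
#       dkim=pass header.i=@example.com;
#       dmarc=pass header.from=example.com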
# 1 if the Delivered-To header matches the To header, else 0
def check_deliver_receiver(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    deliver = message.get('Delivered-To')
    receiver = message.get('To')
    return 1 if deliver == receiver else 0
# 1 if any Received header mentions a TLS 'version' string, else 0
# (name kept as-is to avoid breaking callers; 'encrypt' is the intended word)
def check_encript(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    received_headers = message.get_all('Received')
    # get_all returns None when no Received headers exist
    if received_headers is None:
        return 0
    for received_header in received_headers:
        if 'version' in received_header:
            return 1
    return 0
def get_tags_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # collect the name of every tag in the document
    return [tag.name for tag in soup.find_all()]

#get urls in html content
def get_urls_from_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = []
    # collect every http(s) link from the anchor tags
    for tag in soup.find_all('a'):
        href = tag.get('href')
        if href and re.match(r'^https?://', href):
            urls.append(href)
    return urls
# get how many words are in the email text or html
def get_num_words(file):
    text = get_text(file)
    if text != "":
        return len(text.split())
    html = get_html_general(file)
    if html != "":
        return len(get_text_from_html(html).split())
    return 0

# get how many characters are in the email text or html (spaces excluded)
def get_num_chars(file):
    text = get_text(file)
    if text != "":
        return len(text.replace(" ", ""))
    html = get_html_general(file)
    if html != "":
        return len(get_text_from_html(html).replace(" ", ""))
    return 0

#calculate the body richness by dividing the number of words by the number of characters
def get_body_richness(file):
    if get_num_chars(file) == 0: return 0
    return get_num_words(file)/get_num_chars(file)

#get how many of the listed function words appear in the content
def get_num_FunctionWords(file):
    function_words = ["account", "access", "bank", "credit", "click", "identity",
                      "inconvenience", "information", "limited", "log", "minutes",
                      "password", "recently", "risk", "social", "security",
                      "service", "suspended"]
    if get_text(file) != "":
        content = get_text(file).split()
    elif get_html_general(file) != "":
        content = get_text_from_html(get_html_general(file)).split()
    else:
        return None
    # count presence, not frequency: each word adds at most 1
    count = 0
    for w in function_words:
        if w in content:
            count += 1
    return count


#get how many words are in the subject
def get_num_sbj(file):
    return len(get_subject(file).split())

def get_subject(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    subject = message.get('Subject', "")
    # collapse folded-header whitespace into single spaces
    subject = re.sub(r"\s+", " ", str(subject))
    return subject


def get_sender(file):
    file.seek(0)
    message = email.message_from_bytes(file.read())
    # returns None when there is no From header
    return message.get('From')

#get how many characters are in the subject
def get_num_sbjChar(file):
    return len(get_subject(file))

#calculate the subject richness by dividing words by characters
def get_sbj_richness(file):
    if get_num_sbjChar(file) == 0: return 0
    return get_num_sbj(file)/get_num_sbjChar(file)

# get how many urls have an ip address as their host
def get_num_urls_ip(file):
    content = get_html_general(file)
    if content == "": return 0
    urls = get_urls_from_html(content)
    num_ip = 0
    for url in urls:
        hostname = urlparse(url).hostname
        if hostname is None:
            continue
        try:
            ipaddress.ip_address(hostname)
            num_ip += 1
        except ValueError:
            # hostname is a regular domain name, not an ip address
            pass
    return num_ip

# return the total amount of urls in html content
def get_num_urls(file):
    urls = get_urls_from_html(get_html_general(file))
    if urls == []:
        return None
    return len(urls)

# get how many clickable image links are in the html
def get_num_image_urls(file):
    soup = BeautifulSoup(get_html_general(file), 'html.parser')
    # Find all <a> tags with an href that wrap an <img> tag
    # (the original passed recursive=True, limit=None, string=None, which are
    # just the find_all defaults)
    image_links = soup.find_all('a', href=True)
    image_links_with_img = [link for link in image_links if link.find('img')]
    return len(image_links_with_img)

# count the distinct domains across the urls in the html
def get_num_domain_urls(file):
    urls = get_urls_from_html(get_html_general(file))
    domains = set()
    for url in urls:
        # no trailing '/' required (the original pattern missed urls like
        # http://example.com with no path)
        match = re.search(r'https?://([^/]+)', url)
        if match:
            domains.add(match.group(1))
    # Count the number of distinct domains
    return len(domains)


#get how many urls contain explicit port info
def get_num_url_ports(file):
    urls = get_urls_from_html(get_html_general(file))
    count = 0
    for url in urls:
        parsed_url = urlparse(url)
        # parsed_url.port is None when the url has no explicit port
        if parsed_url.port:
            count += 1
    return count


#get how many characters are in the sender address
def get_chars_sender(file):
    sender = get_sender(file)
    # a missing From header would otherwise be counted as the 4 chars of 'None'
    if sender is None:
        return 0
    return len(sender)
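
# A minimal usage sketch, assuming 'sample.eml' is a hypothetical raw message
# on disk (the path is illustrative, not part of this module). Each helper
# rewinds the handle itself, so one file object can feed every extractor.
if __name__ == "__main__":
    with open("sample.eml", "rb") as f:
        features = {
            "spf": check_spf(f),
            "dkim": check_dkim(f),
            "dmarc": check_dmarc(f),
            "num_words": get_num_words(f),
            "num_chars": get_num_chars(f),
            "body_richness": get_body_richness(f),
            "num_urls": get_num_urls(f),
            "num_urls_ip": get_num_urls_ip(f),
            "subject_richness": get_sbj_richness(f),
        }
        print(features)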