Spaces:

allenchienxxx
/

PEF

Runtime error

App Files Files Community

allenchienxxx committed on Jun 19, 2023

Commit

fd712e2

1 Parent(s): 34daa83

Update modules.py

Browse files

Files changed (1) hide show

modules.py +128 -139

modules.py CHANGED Viewed

@@ -6,65 +6,62 @@ def get_text_from_html(html_content):
     # print(all_text)
     return all_text
 # get text content type from email
-def get_text(file_path):
-    with open(file_path, 'rb') as file:
-        message = email.message_from_bytes(file.read())
-        text_content = ""
-        for part in message.walk():
-            if part.get_content_type() == 'text/plain':
-                text_content += part.get_payload(decode=True).decode('iso-8859-1')
-                # print(text_content)
-                return text_content.replace("\n","")
-        if text_content == "":
-            return get_text_from_html(get_html_general(file_path));
 from bs4 import BeautifulSoup
 import email
-def get_email_html(file_path):
-    with open(file_path, 'rb') as file:
-        content = email.message_from_bytes(file.read())
-        html_content = ""
-        for part in content.walk():
-            if part.get_content_type() == 'text/html':
-                html_content += part.get_payload(decode=True).decode('iso-8859-1')
-        html_content.replace("\n","")
-        if html_content != "":
-            # print("Found html at "+file_path)
-            return html_content
-        else:
-            # print("No html content found at "+file_path)
-            return ""
 #get html by searching for <html> tag
-def get_html(file_path):
-    with open(file_path, 'r',encoding='iso-8859-1') as file:
-        html_flag = False
-        html_content = "";
-        tag_list = []
-        for line in file:
-            words = line.split()
-            for word in words:
-                if word == "<html>":
-                    html_flag = True;
-                if html_flag:
-                    html_content += word
-                if word == "</html>":
-                    html_flag = False;
-        # print(html_content)
-        html_content.replace("\n","")
-        if html_content == "":
-            # print("No html content found at "+file_path)
-            return ""
-        else:
-            # print("Found html at "+file_path)
-            return html_content
-def get_html_general(file_path):
-    if get_email_html(file_path)!="":
-        return get_email_html(file_path)
     else:
-        return get_html(file_path)
-def get_onclicks(file_path):
-    content = get_html_general(file_path)
     if content == "": return None
     soup = BeautifulSoup(content, 'html.parser')
@@ -72,8 +69,8 @@ def get_onclicks(file_path):
     # Count the number of elements with an onClick attribute
     count = len(elements)
     return count
-def check_popWindow(file_path):
-    content = get_html_general(file_path)
     if content == "": return None
     soup = BeautifulSoup(content, 'html.parser')
@@ -89,8 +86,7 @@ def check_popWindow(file_path):
     except TypeError:
         return False
-def check_spf(file_path):
-  with open(file_path, 'rb') as file:
     message = email.message_from_bytes(file.read())
     received_spf_header = message.get('Received-SPF')
     if received_spf_header == None:
@@ -107,8 +103,7 @@ def check_spf(file_path):
           return 0
     else:
         return 0
-def check_dkim(file_path):
-  with open(file_path, 'rb') as file:
     message = email.message_from_bytes(file.read())
     auth = message.get('Authentication-Results')
     if auth == None:
@@ -120,8 +115,7 @@ def check_dkim(file_path):
       return 1
     else:
       return 0
-def check_dmarc(file_path):
-  with open(file_path, 'rb') as file:
     message = email.message_from_bytes(file.read())
     auth = message.get('Authentication-Results')
     if auth == None:
@@ -133,8 +127,7 @@ def check_dmarc(file_path):
       return 1
     else:
       return 0
-def check_deliver_receiver(filepath):
-  with open(filepath, 'rb') as file:
     message = email.message_from_bytes(file.read())
     deliver = message.get('Delivered-To')
     # print(deliver)
@@ -144,8 +137,7 @@ def check_deliver_receiver(filepath):
       return 1
     else:
       return 0
-def check_encript(filepath):
-  with open(filepath, 'rb') as file:
     message = email.message_from_bytes(file.read())
     received_headers = message.get_all('Received')
     # print(received_headers)
@@ -185,8 +177,7 @@ def get_urls_from_html(html_content):
                 # print(href)
                 urls += [href]
     return urls
-def get_text(file_path):
-    with open(file_path, 'rb') as file:
         message = email.message_from_bytes(file.read())
         text_content = ""
         for part in message.walk():
@@ -195,42 +186,42 @@ def get_text(file_path):
                 # print(text_content)
                 return text_content.replace("\n","")
         if text_content == "":
-            return get_text_from_html(get_html_general(file_path));
-def get_num_words(file_path):
-    if get_text(file_path) != "":
-        words = len(get_text(file_path).split())
         return words
-    if get_html_general(file_path) != "":
-        words = len(get_text_from_html(get_html_general(file_path)).split())
         return words
     else:
         return 0
 # get how many characters in the email text or html
-def get_num_chars(file_path):
-    if get_text(file_path) != "":
-        chars = len(get_text(file_path).replace(" ",""))
         return chars
-    if get_html_general(file_path) != "":
-        chars = len(get_text_from_html(get_html_general(file_path)).replace(" ",""))
         return chars
     else:
         return 0
 #calculate the body richness by dividing the number of words by the number of characters
-def get_body_richness(filepath):
-    if get_num_chars(filepath) == 0: return 0
-    return get_num_words(filepath)/get_num_chars(filepath)
 #get how many function words are in the content
-def get_num_FunctionWords(file_path):
     function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
     content = ""
     count = 0
-    if get_text(file_path) != "":
-        content = get_text(file_path).split()
-    elif get_html_general(file_path) != "":
-        content = get_text_from_html(get_html_general(file_path)).split()
     else:
         return None
     for w in function_words:
@@ -239,8 +230,8 @@ def get_num_FunctionWords(file_path):
     return count
-def get_email_html(file_path):
-    with open(file_path, 'rb') as file:
         content = email.message_from_bytes(file.read())
         html_content = ""
         for part in content.walk():
@@ -248,62 +239,60 @@ def get_email_html(file_path):
                 html_content += part.get_payload(decode=True).decode('iso-8859-1')
         html_content.replace("\n","")
         if html_content != "":
-            # print("Found html at "+file_path)
             return html_content
         else:
-            # print("No html content found at "+file_path)
             return ""
 #get how many words in subject
-def get_num_sbj(file_path):
-    count = len(get_subject(file_path).split())
     return count
-def get_subject(file_path):
-    with open(file_path, 'rb') as file:
-        message = email.message_from_bytes(file.read())
-        headers = message.items()
-        # Print the headers
-        subject = ""
-        for header in headers:
-            if header[0] == "Subject":
-                # print(header[1])
-                subject = header[1]
-                break
-        # if subject == "":
-            # print("No subject found")
-        subject = re.sub(r"\s+", " ", str(subject))
-        return subject
-def get_sender(file_path):
-    with open(file_path, 'rb') as file:
-        message = email.message_from_bytes(file.read())
-        headers = message.items()
-        # Print the headers
-        sender = ""
-        for header in headers:
-            if header[0] == "From":
-                # print(header[1])
-                sender = header[1]
-                break
-        if sender == "":
-            return None
-        # subject = re.sub(r"\s+", " ", str(subject))
-        return sender
 #get how many characters in subject
-def get_num_sbjChar(file_path):
-    count = len(get_subject(file_path))
     return count
 #calculate the subject richness by dividing the number of words by the number of characters
-def get_sbj_richness(file_path):
-    if get_num_sbjChar(file_path) == 0:return 0
-    return get_num_sbj(file_path)/get_num_sbjChar(file_path)
 # get how many urls have an ip address in them
-def get_num_urls_ip(file_path):
-    content = get_html_general(file_path)
     if content == "": return 0
     urls = get_urls_from_html(content)
     num_ip = 0
@@ -321,15 +310,15 @@ def get_num_urls_ip(file_path):
     return num_ip
 # return the total number of urls in the html content
-def get_num_urls(file_path):
-    urls = get_urls_from_html(get_html_general(file_path))
     if urls == []:
         return None
     return len(urls)
 # get how many image urls in the html
-def get_num_image_urls(file_path):
-    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
     # Find all <a> tags that contain an <img> tag
     image_links = soup.find_all('a', href=True, recursive=True, limit=None, string=None)
@@ -342,8 +331,8 @@ def get_num_image_urls(file_path):
     #     print(f"Clickable image link: {href} - Image URL: {src}")
 # get the number of urls that contain a domain name
-def get_num_domain_urls(file_path):
-    urls = get_urls_from_html(get_html_general(file_path))
     domains = set()
     for url in urls:
         match = re.search(r'https?://([^/]+)/', url)
@@ -357,8 +346,8 @@ def get_num_domain_urls(file_path):
 #get how many urls contain port info
-def get_num_url_ports(file_path):
-    urls = get_urls_from_html(get_html_general(file_path))
     count = 0
     for url in urls:
         parsed_url = urlparse(url)
@@ -372,6 +361,6 @@ def get_num_url_ports(file_path):
 #get how many characters in sender
-def get_chars_sender(file_path):
-    sender = get_sender(file_path)
     return len(str(sender))

     # print(all_text)
     return all_text
 # get text content type from email
+def get_text(file):
+    message = email.message_from_bytes(file.read())
+    text_content = ""
+    for part in message.walk():
+        if part.get_content_type() == 'text/plain':
+            text_content += part.get_payload(decode=True).decode('iso-8859-1')
+            # print(text_content)
+            return text_content.replace("\n","")
+    if text_content == "":
+        return get_text_from_html(get_html_general(file));
 from bs4 import BeautifulSoup
 import email
+def get_email_html(file):
+    content = email.message_from_bytes(file.read())
+    html_content = ""
+    for part in content.walk():
+        if part.get_content_type() == 'text/html':
+            html_content += part.get_payload(decode=True).decode('iso-8859-1')
+    html_content.replace("\n","")
+    if html_content != "":
+        # print("Found html at "+file)
+        return html_content
+    else:
+        # print("No html content found at "+file)
+        return ""
 #get html by searching for <html> tag
+def get_html(file):
+    html_flag = False
+    html_content = "";
+    tag_list = []
+    for line in file:
+        words = line.split()
+        for word in words:
+            if word == "<html>":
+                html_flag = True;
+            if html_flag:
+                html_content += word
+            if word == "</html>":
+                html_flag = False;
+    # print(html_content)
+    html_content.replace("\n","")
+    if html_content == "":
+        # print("No html content found at "+file)
+        return ""
+    else:
+        # print("Found html at "+file)
+        return html_content
+def get_html_general(file):
+    if get_email_html(file)!="":
+        return get_email_html(file)
     else:
+        return get_html(file)
+def get_onclicks(file):
+    content = get_html_general(file)
     if content == "": return None
     soup = BeautifulSoup(content, 'html.parser')
     # Count the number of elements with an onClick attribute
     count = len(elements)
     return count
+def check_popWindow(file):
+    content = get_html_general(file)
     if content == "": return None
     soup = BeautifulSoup(content, 'html.parser')
     except TypeError:
         return False
+def check_spf(file):
     message = email.message_from_bytes(file.read())
     received_spf_header = message.get('Received-SPF')
     if received_spf_header == None:
           return 0
     else:
         return 0
+def check_dkim(file):
     message = email.message_from_bytes(file.read())
     auth = message.get('Authentication-Results')
     if auth == None:
       return 1
     else:
       return 0
+def check_dmarc(file):
     message = email.message_from_bytes(file.read())
     auth = message.get('Authentication-Results')
     if auth == None:
       return 1
     else:
       return 0
+def check_deliver_receiver(file):
     message = email.message_from_bytes(file.read())
     deliver = message.get('Delivered-To')
     # print(deliver)
       return 1
     else:
       return 0
+def check_encript(file):
     message = email.message_from_bytes(file.read())
     received_headers = message.get_all('Received')
     # print(received_headers)
                 # print(href)
                 urls += [href]
     return urls
+def get_text(file):
         message = email.message_from_bytes(file.read())
         text_content = ""
         for part in message.walk():
                 # print(text_content)
                 return text_content.replace("\n","")
         if text_content == "":
+            return get_text_from_html(get_html_general(file));
+def get_num_words(file):
+    if get_text(file) != "":
+        words = len(get_text(file).split())
         return words
+    if get_html_general(file) != "":
+        words = len(get_text_from_html(get_html_general(file)).split())
         return words
     else:
         return 0
 # get how many characters in the email text or html
+def get_num_chars(file):
+    if get_text(file) != "":
+        chars = len(get_text(file).replace(" ",""))
         return chars
+    if get_html_general(file) != "":
+        chars = len(get_text_from_html(get_html_general(file)).replace(" ",""))
         return chars
     else:
         return 0
 #calculate the body richness by dividing the number of words by the number of characters
+def get_body_richness(file):
+    if get_num_chars(file) == 0: return 0
+    return get_num_words(file)/get_num_chars(file)
 #get how many function words are in the content
+def get_num_FunctionWords(file):
     function_words = ["account","access","bank","credit","click","identity","inconvenience","information","limited","log","minutes","password","recently","risk","social","security","service","suspended"]
     content = ""
     count = 0
+    if get_text(file) != "":
+        content = get_text(file).split()
+    elif get_html_general(file) != "":
+        content = get_text_from_html(get_html_general(file)).split()
     else:
         return None
     for w in function_words:
     return count
+def get_email_html(file):
+    with open(file, 'rb') as file:
         content = email.message_from_bytes(file.read())
         html_content = ""
         for part in content.walk():
                 html_content += part.get_payload(decode=True).decode('iso-8859-1')
         html_content.replace("\n","")
         if html_content != "":
+            # print("Found html at "+file)
             return html_content
         else:
+            # print("No html content found at "+file)
             return ""
 #get how many words in subject
+def get_num_sbj(file):
+    count = len(get_subject(file).split())
     return count
+def get_subject(file):
+    message = email.message_from_bytes(file.read())
+    headers = message.items()
+    # Print the headers
+    subject = ""
+    for header in headers:
+        if header[0] == "Subject":
+            # print(header[1])
+            subject = header[1]
+            break
+    # if subject == "":
+        # print("No subject found")
+    subject = re.sub(r"\s+", " ", str(subject))
+    return subject
+def get_sender(file):
+    message = email.message_from_bytes(file.read())
+    headers = message.items()
+    # Print the headers
+    sender = ""
+    for header in headers:
+        if header[0] == "From":
+            # print(header[1])
+            sender = header[1]
+            break
+    if sender == "":
+        return None
+    # subject = re.sub(r"\s+", " ", str(subject))
+    return sender
 #get how many characters in subject
+def get_num_sbjChar(file):
+    count = len(get_subject(file))
     return count
 #calculate the subject richness by dividing the number of words by the number of characters
+def get_sbj_richness(file):
+    if get_num_sbjChar(file) == 0:return 0
+    return get_num_sbj(file)/get_num_sbjChar(file)
 # get how many urls have an ip address in them
+def get_num_urls_ip(file):
+    content = get_html_general(file)
     if content == "": return 0
     urls = get_urls_from_html(content)
     num_ip = 0
     return num_ip
 # return the total number of urls in the html content
+def get_num_urls(file):
+    urls = get_urls_from_html(get_html_general(file))
     if urls == []:
         return None
     return len(urls)
 # get how many image urls in the html
+def get_num_image_urls(file):
+    soup = BeautifulSoup(get_html_general(file), 'html.parser')
     # Find all <a> tags that contain an <img> tag
     image_links = soup.find_all('a', href=True, recursive=True, limit=None, string=None)
     #     print(f"Clickable image link: {href} - Image URL: {src}")
 # get the number of urls that contain a domain name
+def get_num_domain_urls(file):
+    urls = get_urls_from_html(get_html_general(file))
     domains = set()
     for url in urls:
         match = re.search(r'https?://([^/]+)/', url)
 #get how many urls contain port info
+def get_num_url_ports(file):
+    urls = get_urls_from_html(get_html_general(file))
     count = 0
     for url in urls:
         parsed_url = urlparse(url)
 #get how many characters in sender
+def get_chars_sender(file):
+    sender = get_sender(file)
     return len(str(sender))