princemaxp committed on
Commit
f88bfb7
·
verified ·
1 Parent(s): 36e5719

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +67 -21
parse_email.py CHANGED
@@ -1,50 +1,96 @@
 
1
  import email
2
  from email import policy
3
  from bs4 import BeautifulSoup
4
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
def parse_email(file_path):
    """Parse an .eml file into headers, body text, and URLs.

    Args:
        file_path: Path to an RFC 822 message file, read in binary mode.

    Returns:
        Tuple ``(headers, body, urls)``:
        headers -- dict of message headers (duplicate header names collapse;
                   last value wins, since ``dict()`` keeps the final pair).
        body    -- concatenated text of text/plain parts plus text extracted
                   from text/html parts (for a single-part HTML message the
                   raw HTML is kept, matching the original behavior).
        urls    -- de-duplicated list of http(s) URLs found in the body and
                   in the href attributes of HTML anchors.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body (text + html) ---
    body = ""
    urls = set()
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                try:
                    body += part.get_content()
                # Best-effort: skip undecodable parts rather than failing the
                # whole parse, but never swallow KeyboardInterrupt/SystemExit
                # the way a bare `except:` would.
                except Exception:
                    pass
            elif content_type == "text/html":
                try:
                    html_body = part.get_content()
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                    # Collect anchor targets in the same pass, so each HTML
                    # part is parsed once instead of re-parsed in a second
                    # walk over the message.
                    for link in soup.find_all("a", href=True):
                        urls.add(link["href"])
                except Exception:
                    pass
    else:
        try:
            body = msg.get_content()
            # A single-part HTML message still contributes its anchors.
            if msg.get_content_type() == "text/html":
                soup = BeautifulSoup(body, "html.parser")
                for link in soup.find_all("a", href=True):
                    urls.add(link["href"])
        except Exception:
            body = ""

    # --- 3. Extract URLs from the visible text ---
    urls.update(re.findall(r"https?://[^\s]+", body))

    return headers, body, list(urls)
 
1
+ # parse_email.py
2
  import email
3
  from email import policy
4
  from bs4 import BeautifulSoup
5
  import re
6
+ import base64
7
+ import io
8
+
9
def _extract_inline_images_from_html(html):
    """Return decoded bytes of every base64 ``data:image/...`` URI found in
    ``<img src=...>`` attributes of the given HTML fragment.

    Args:
        html: HTML markup; ``None``/empty is treated as "no images".

    Returns:
        List of raw image payloads (bytes); malformed data URIs are skipped.
    """
    images = []
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src.startswith("data:image/"):
            continue
        # e.g. data:image/png;base64,iVBORw0...
        try:
            _, b64 = src.split(",", 1)
            images.append(base64.b64decode(b64))
        except ValueError:
            # No comma separator, or invalid base64 payload
            # (binascii.Error is a ValueError subclass).
            continue
    return images
23
 
24
def parse_email(file_path):
    """Parse an .eml file into headers, subject, body text, URLs and images.

    Args:
        file_path: Path to an RFC 822 message file, read in binary mode.

    Returns:
        Tuple ``(headers, subject, body, urls, images)``:
        headers -- dict of message headers (duplicate names collapse,
                   last value wins).
        subject -- the Subject header, or "" when absent.
        body    -- concatenated text of non-attachment text/plain parts plus
                   text extracted from non-attachment text/html parts.
        urls    -- de-duplicated list of http(s) URLs found in the body and
                   in any header value (e.g. List-Unsubscribe).
        images  -- raw bytes of image/* parts and of base64 data-URI images
                   embedded in HTML bodies.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""

    body = ""
    images = []

    # Walk parts - handle multipart and attachments
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "").lower()

            # Image parts (inline or attached): keep the decoded payload.
            if ctype.startswith("image/"):
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    # Best-effort: a corrupt part must not abort the parse.
                    pass

            # Plain-text body (skip parts marked as attachments).
            if ctype == "text/plain" and "attachment" not in disp:
                try:
                    body += part.get_content()
                except Exception:
                    pass

            # HTML body: harvest inline data-URI images, then flatten to text.
            if ctype == "text/html" and "attachment" not in disp:
                try:
                    html_body = part.get_content()
                    images += _extract_inline_images_from_html(html_body)
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                except Exception:
                    pass
    else:
        # Single-part message.
        try:
            if msg.get_content_type() == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body = soup.get_text(" ", strip=True)
            else:
                body = msg.get_content()
        except Exception:
            body = ""

    # URL extraction: compile the pattern once and scan the body plus every
    # header value. Quotes and angle brackets are excluded so surrounding
    # markup (e.g. <https://...> in List-Unsubscribe) is not captured.
    # (re.findall on a str cannot raise, so no try/except is needed here.)
    url_re = re.compile(r"https?://[^\s\"'<>]+")
    urls = set(url_re.findall(body))
    for value in headers.values():
        urls.update(url_re.findall(str(value)))

    return headers, subject, body, list(urls), images