Update parse_email.py
parse_email.py  CHANGED  (+67 -21)
@@ -1,50 +1,96 @@
+# parse_email.py
 import email
 from email import policy
 from bs4 import BeautifulSoup
 import re
+import base64
+import io
+
+def _extract_inline_images_from_html(html):
+    images = []
+    soup = BeautifulSoup(html or "", "html.parser")
+    for img in soup.find_all("img"):
+        src = img.get("src", "")
+        if src.startswith("data:image/"):
+            # e.g. data:image/png;base64,iVBORw0...
+            try:
+                header, b64 = src.split(",", 1)
+                data = base64.b64decode(b64)
+                images.append(data)
+            except Exception:
+                continue
+    return images
 
 def parse_email(file_path):
+    """
+    Returns: headers(dict), subject(str), body(str), urls(list), images(list of bytes)
+    """
     with open(file_path, "rb") as f:
         msg = email.message_from_binary_file(f, policy=policy.default)
 
-    # --- 1. Extract headers ---
     headers = dict(msg.items())
+    subject = headers.get("Subject", "") or ""
 
-    # --- 2. Extract body (text + html) ---
     body = ""
+    images = []
+
+    # Walk parts - handle multipart and attachments
     if msg.is_multipart():
         for part in msg.walk():
-            ctype = part.get_content_type()
-            if ctype == "text/plain":
+            ctype = part.get_content_type()
+            disp = str(part.get("Content-Disposition") or "").lower()
+            # attachments that are images
+            if ctype.startswith("image/"):
+                try:
+                    data = part.get_payload(decode=True)
+                    if data:
+                        images.append(data)
+                except Exception:
+                    pass
+
+            # text/plain
+            if ctype == "text/plain" and "attachment" not in disp:
                 try:
                     body += part.get_content()
-                except:
+                except Exception:
                     pass
-            elif ctype == "text/html":
+
+            # text/html
+            if ctype == "text/html" and "attachment" not in disp:
                 try:
                     html_body = part.get_content()
+                    # extract inline images from this html (data URIs)
+                    images += _extract_inline_images_from_html(html_body)
+                    # convert html to text
                     soup = BeautifulSoup(html_body, "html.parser")
                     body += soup.get_text(" ", strip=True)
-                except:
+                except Exception:
                     pass
     else:
+        # not multipart
        try:
-            body = msg.get_content()
-        except:
+            if msg.get_content_type() == "text/html":
+                html_body = msg.get_content()
+                images += _extract_inline_images_from_html(html_body)
+                soup = BeautifulSoup(html_body, "html.parser")
+                body = soup.get_text(" ", strip=True)
+            else:
+                body = msg.get_content()
+        except Exception:
            body = ""
 
-    # --- 3. Extract URLs ---
+    # URL extraction (from combined body)
     urls = set()
-    urls.update(re.findall(r"https?://\S+", body))
+    try:
+        urls.update(re.findall(r"https?://[^\s\"'<>]+", body))
+    except Exception:
+        pass
 
-    # also pull URLs out of any html anchor tags
-    try:
-        soup = BeautifulSoup(body, "html.parser")
-        links = soup.find_all("a", href=True)
-        for link in links:
-            if link.get("href"):
-                urls.add(link["href"])
-    except:
-        pass
+    # Also try to find URLs in headers (e.g., List-Unsubscribe) or other parts
+    for k, v in headers.items():
+        try:
+            urls.update(re.findall(r"https?://[^\s\"'<>]+", str(v)))
+        except Exception:
+            pass
 
-    return headers, body, list(urls)
+    return headers, subject, body, list(urls), images
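For reference, a minimal usage sketch of the updated function (not part of the commit; the module import, file name, and addresses are made up for illustration). It builds a throwaway multipart .eml with a plain-text part and an HTML part containing a visible link plus an inline data: URI image, then runs parse_email() on it:

import base64
from email.message import EmailMessage

from parse_email import parse_email  # assumes parse_email.py is importable

# Dummy inline image: the parser only base64-decodes data: URIs,
# so any valid base64 payload will do here.
data_uri = "data:image/png;base64," + base64.b64encode(b"\x89PNG-ish bytes").decode()

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["To"] = "bob@example.com"
msg["Subject"] = "Quarterly report"
msg.set_content("Plain-text part with a link: https://example.com/report")
msg.add_alternative(
    f'<html><body><p>Visit https://example.org/promo</p>'
    f'<img src="{data_uri}"></body></html>',
    subtype="html",
)

with open("sample.eml", "wb") as f:
    f.write(msg.as_bytes())

headers, subject, body, urls, images = parse_email("sample.eml")
print(subject)       # Quarterly report
print(sorted(urls))  # both URLs, matched by the regex over the combined body
print(len(images))   # 1 -- decoded from the inline data: URI

Note that the URL regex runs over the text extracted by get_text(), so only URLs visible in the body text (or in headers) are found; bare href attributes are not. The inline image comes back as raw bytes because _extract_inline_images_from_html only base64-decodes the data: URI and never validates the image format.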