princemaxp committed on
Commit
f88bfb7
·
verified ·
1 Parent(s): 36e5719

Update parse_email.py

Browse files
Files changed (1) hide show
  1. parse_email.py +67 -21
parse_email.py CHANGED
@@ -1,50 +1,96 @@
 
1
  import email
2
  from email import policy
3
  from bs4 import BeautifulSoup
4
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
def parse_email(file_path):
    """Parse an .eml file into headers, body text, and URLs.

    Args:
        file_path: Path to an RFC 822 message file, read in binary mode.

    Returns:
        Tuple ``(headers, body, urls)``:
        headers -- dict of message headers (duplicate header names collapse;
                   last value wins, since ``dict()`` keeps the final pair).
        body    -- concatenated text of text/plain parts plus text extracted
                   from text/html parts (for a single-part HTML message the
                   raw HTML is kept, matching the original behavior).
        urls    -- de-duplicated list of http(s) URLs found in the body and
                   in the href attributes of HTML anchors.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    # --- 1. Extract headers ---
    headers = dict(msg.items())

    # --- 2. Extract body (text + html) ---
    body = ""
    urls = set()
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == "text/plain":
                try:
                    body += part.get_content()
                # Best-effort: skip undecodable parts rather than failing the
                # whole parse, but never swallow KeyboardInterrupt/SystemExit
                # the way a bare `except:` would.
                except Exception:
                    pass
            elif content_type == "text/html":
                try:
                    html_body = part.get_content()
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                    # Collect anchor targets in the same pass, so each HTML
                    # part is parsed once instead of re-parsed in a second
                    # walk over the message.
                    for link in soup.find_all("a", href=True):
                        urls.add(link["href"])
                except Exception:
                    pass
    else:
        try:
            body = msg.get_content()
            # A single-part HTML message still contributes its anchors.
            if msg.get_content_type() == "text/html":
                soup = BeautifulSoup(body, "html.parser")
                for link in soup.find_all("a", href=True):
                    urls.add(link["href"])
        except Exception:
            body = ""

    # --- 3. Extract URLs from the visible text ---
    urls.update(re.findall(r"https?://[^\s]+", body))

    return headers, body, list(urls)
 
1
+ # parse_email.py
2
  import email
3
  from email import policy
4
  from bs4 import BeautifulSoup
5
  import re
6
+ import base64
7
+ import io
8
+
9
def _extract_inline_images_from_html(html):
    """Return decoded bytes of every base64 ``data:image/...`` URI found in
    ``<img src=...>`` attributes of the given HTML fragment.

    Args:
        html: HTML markup; ``None``/empty is treated as "no images".

    Returns:
        List of raw image payloads (bytes); malformed data URIs are skipped.
    """
    images = []
    soup = BeautifulSoup(html or "", "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src.startswith("data:image/"):
            continue
        # e.g. data:image/png;base64,iVBORw0...
        try:
            _, b64 = src.split(",", 1)
            images.append(base64.b64decode(b64))
        except ValueError:
            # No comma separator, or invalid base64 payload
            # (binascii.Error is a ValueError subclass).
            continue
    return images
23
 
24
def parse_email(file_path):
    """Parse an .eml file into headers, subject, body text, URLs and images.

    Args:
        file_path: Path to an RFC 822 message file, read in binary mode.

    Returns:
        Tuple ``(headers, subject, body, urls, images)``:
        headers -- dict of message headers (duplicate names collapse,
                   last value wins).
        subject -- the Subject header, or "" when absent.
        body    -- concatenated text of non-attachment text/plain parts plus
                   text extracted from non-attachment text/html parts.
        urls    -- de-duplicated list of http(s) URLs found in the body and
                   in any header value (e.g. List-Unsubscribe).
        images  -- raw bytes of image/* parts and of base64 data-URI images
                   embedded in HTML bodies.
    """
    with open(file_path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    headers = dict(msg.items())
    subject = headers.get("Subject", "") or ""

    body = ""
    images = []

    # Walk parts - handle multipart and attachments
    if msg.is_multipart():
        for part in msg.walk():
            ctype = part.get_content_type()
            disp = str(part.get("Content-Disposition") or "").lower()

            # Image parts (inline or attached): keep the decoded payload.
            if ctype.startswith("image/"):
                try:
                    data = part.get_payload(decode=True)
                    if data:
                        images.append(data)
                except Exception:
                    # Best-effort: a corrupt part must not abort the parse.
                    pass

            # Plain-text body (skip parts marked as attachments).
            if ctype == "text/plain" and "attachment" not in disp:
                try:
                    body += part.get_content()
                except Exception:
                    pass

            # HTML body: harvest inline data-URI images, then flatten to text.
            if ctype == "text/html" and "attachment" not in disp:
                try:
                    html_body = part.get_content()
                    images += _extract_inline_images_from_html(html_body)
                    soup = BeautifulSoup(html_body, "html.parser")
                    body += soup.get_text(" ", strip=True)
                except Exception:
                    pass
    else:
        # Single-part message.
        try:
            if msg.get_content_type() == "text/html":
                html_body = msg.get_content()
                images += _extract_inline_images_from_html(html_body)
                soup = BeautifulSoup(html_body, "html.parser")
                body = soup.get_text(" ", strip=True)
            else:
                body = msg.get_content()
        except Exception:
            body = ""

    # URL extraction: compile the pattern once and scan the body plus every
    # header value. Quotes and angle brackets are excluded so surrounding
    # markup (e.g. <https://...> in List-Unsubscribe) is not captured.
    # (re.findall on a str cannot raise, so no try/except is needed here.)
    url_re = re.compile(r"https?://[^\s\"'<>]+")
    urls = set(url_re.findall(body))
    for value in headers.values():
        urls.update(url_re.findall(str(value)))

    return headers, subject, body, list(urls), images