allenchienxxx committed on
Commit
ae55d84
1 Parent(s): 7aed46a

Update modules.py

Browse files
Files changed (1) hide show
  1. modules.py +20 -21
modules.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  def get_text_from_html(html_content):
2
  soup = BeautifulSoup(html_content, 'html.parser')
3
  # extract all the texts
@@ -16,8 +23,7 @@ def get_text(file):
16
  return text_content.replace("\n","")
17
  if text_content == "":
18
  return get_text_from_html(get_html_general(file));
19
- from bs4 import BeautifulSoup
20
- import email
21
  def get_email_html(file):
22
  content = email.message_from_bytes(file.read())
23
  html_content = ""
@@ -157,12 +163,6 @@ def get_tags_from_html(html_content):
157
  tag_list += [tag.name]
158
  # print(tag_list)
159
  return tag_list
160
- import ipaddress
161
- from urllib.parse import urlparse
162
- import urllib.request
163
- from bs4 import BeautifulSoup
164
- import re
165
- import email
166
 
167
  #get urls in html content
168
  def get_urls_from_html(html_content):
@@ -231,19 +231,18 @@ def get_num_FunctionWords(file):
231
 
232
 
233
  def get_email_html(file):
234
- with open(file, 'rb') as file:
235
- content = email.message_from_bytes(file.read())
236
- html_content = ""
237
- for part in content.walk():
238
- if part.get_content_type() == 'text/html':
239
- html_content += part.get_payload(decode=True).decode('iso-8859-1')
240
- html_content.replace("\n","")
241
- if html_content != "":
242
- # print("Found html at "+file)
243
- return html_content
244
- else:
245
- # print("No html content found at "+file)
246
- return ""
247
 
248
  #get how many words in subject
249
  def get_num_sbj(file):
 
1
+ import ipaddress
2
+ from urllib.parse import urlparse
3
+ import urllib.request
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import email
7
+
8
  def get_text_from_html(html_content):
9
  soup = BeautifulSoup(html_content, 'html.parser')
10
  # extract all the texts
 
23
  return text_content.replace("\n","")
24
  if text_content == "":
25
  return get_text_from_html(get_html_general(file));
26
+
 
27
  def get_email_html(file):
28
  content = email.message_from_bytes(file.read())
29
  html_content = ""
 
163
  tag_list += [tag.name]
164
  # print(tag_list)
165
  return tag_list
 
 
 
 
 
 
166
 
167
  #get urls in html content
168
  def get_urls_from_html(html_content):
 
231
 
232
 
233
  def get_email_html(file):
234
+ content = email.message_from_bytes(file.read())
235
+ html_content = ""
236
+ for part in content.walk():
237
+ if part.get_content_type() == 'text/html':
238
+ html_content += part.get_payload(decode=True).decode('iso-8859-1')
239
+ html_content.replace("\n","")
240
+ if html_content != "":
241
+ # print("Found html at "+file)
242
+ return html_content
243
+ else:
244
+ # print("No html content found at "+file)
245
+ return ""
 
246
 
247
  #get how many words in subject
248
  def get_num_sbj(file):