Hansimov committed on
Commit
3dda344
1 Parent(s): 62ee9e4

:zap: [Enhance] ignore classes pattern, especially for 163.com

Browse files
documents/webpage_content_extractor.py CHANGED
@@ -34,7 +34,8 @@ class WebpageContentExtractor:
34
 
35
  def remove_elements_from_html(self, html_str):
36
  soup = BeautifulSoup(html_str, "html.parser")
37
- ignore_classes_pattern = f'{"|".join(IGNORE_CLASSES)}'
 
38
  removed_element_counts = 0
39
  for element in soup.find_all():
40
  class_str = ""
 
34
 
35
  def remove_elements_from_html(self, html_str):
36
  soup = BeautifulSoup(html_str, "html.parser")
37
+ ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
38
+ ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
39
  removed_element_counts = 0
40
  for element in soup.find_all():
41
  class_str = ""
networks/network_configs.py CHANGED
@@ -8,7 +8,10 @@ IGNORE_CLASSES = [
8
  # "menu",
9
  "offcanvas",
10
  "navbar",
11
- "post_side",
 
 
 
12
  ]
13
 
14
  IGNORE_HOSTS = [
 
8
  # "menu",
9
  "offcanvas",
10
  "navbar",
11
+ # 163.com
12
+ "post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
13
+ "ntes-.*nav",
14
+ "nav-bottom",
15
  ]
16
 
17
  IGNORE_HOSTS = [