Spaces:
Sleeping
Sleeping
:zap: [Enhance] Extend the ignore-classes pattern, especially for 163.com
Browse files
documents/webpage_content_extractor.py
CHANGED
@@ -34,7 +34,8 @@ class WebpageContentExtractor:
|
|
34 |
|
35 |
def remove_elements_from_html(self, html_str):
|
36 |
soup = BeautifulSoup(html_str, "html.parser")
|
37 |
-
|
|
|
38 |
removed_element_counts = 0
|
39 |
for element in soup.find_all():
|
40 |
class_str = ""
|
|
|
34 |
|
35 |
def remove_elements_from_html(self, html_str):
|
36 |
soup = BeautifulSoup(html_str, "html.parser")
|
37 |
+
ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
|
38 |
+
ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
|
39 |
removed_element_counts = 0
|
40 |
for element in soup.find_all():
|
41 |
class_str = ""
|
networks/network_configs.py
CHANGED
@@ -8,7 +8,10 @@ IGNORE_CLASSES = [
|
|
8 |
# "menu",
|
9 |
"offcanvas",
|
10 |
"navbar",
|
11 |
-
|
|
|
|
|
|
|
12 |
]
|
13 |
|
14 |
IGNORE_HOSTS = [
|
|
|
8 |
# "menu",
|
9 |
"offcanvas",
|
10 |
"navbar",
|
11 |
+
# 163.com
|
12 |
+
"post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
|
13 |
+
"ntes-.*nav",
|
14 |
+
"nav-bottom",
|
15 |
]
|
16 |
|
17 |
IGNORE_HOSTS = [
|