Spaces:
Sleeping
:zap: [Enhance] ignore classes pattern, especially for 163.com
Browse files
documents/webpage_content_extractor.py
CHANGED
|
@@ -34,7 +34,8 @@ class WebpageContentExtractor:
|
|
| 34 |
|
| 35 |
def remove_elements_from_html(self, html_str):
|
| 36 |
soup = BeautifulSoup(html_str, "html.parser")
|
| 37 |
-
|
|
|
|
| 38 |
removed_element_counts = 0
|
| 39 |
for element in soup.find_all():
|
| 40 |
class_str = ""
|
|
|
|
| 34 |
|
| 35 |
def remove_elements_from_html(self, html_str):
|
| 36 |
soup = BeautifulSoup(html_str, "html.parser")
|
| 37 |
+
ignore_classes_with_parentheses = [f"({word})" for word in IGNORE_CLASSES]
|
| 38 |
+
ignore_classes_pattern = f'{"|".join(ignore_classes_with_parentheses)}'
|
| 39 |
removed_element_counts = 0
|
| 40 |
for element in soup.find_all():
|
| 41 |
class_str = ""
|
networks/network_configs.py
CHANGED
|
@@ -8,7 +8,10 @@ IGNORE_CLASSES = [
|
|
| 8 |
# "menu",
|
| 9 |
"offcanvas",
|
| 10 |
"navbar",
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
]
|
| 13 |
|
| 14 |
IGNORE_HOSTS = [
|
|
|
|
| 8 |
# "menu",
|
| 9 |
"offcanvas",
|
| 10 |
"navbar",
|
| 11 |
+
# 163.com
|
| 12 |
+
"post_(top)|(side)|(recommends)|(crumb)|(statement)|(next)|(jubao)",
|
| 13 |
+
"ntes-.*nav",
|
| 14 |
+
"nav-bottom",
|
| 15 |
]
|
| 16 |
|
| 17 |
IGNORE_HOSTS = [
|