Spaces:
Sleeping
Sleeping
:boom: [Fix] WebpageContentExtractor: UnicodeDecodeError
Browse files
documents/webpage_content_extractor.py
CHANGED
@@ -81,8 +81,17 @@ class WebpageContentExtractor:
|
|
81 |
logger.warn(f"File not found: {html_path}")
|
82 |
return ""
|
83 |
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
html_str = self.remove_elements_from_html(html_str)
|
88 |
markdown_str = self.html_to_markdown(html_str)
|
|
|
81 |
logger.warn(f"File not found: {html_path}")
|
82 |
return ""
|
83 |
|
84 |
+
encodings = ["utf-8", "latin-1"]
|
85 |
+
for encoding in encodings:
|
86 |
+
try:
|
87 |
+
with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
|
88 |
+
html_str = rf.read()
|
89 |
+
break
|
90 |
+
except UnicodeDecodeError:
|
91 |
+
pass
|
92 |
+
else:
|
93 |
+
logger.warn(f"No matching encodings: {html_path}")
|
94 |
+
return ""
|
95 |
|
96 |
html_str = self.remove_elements_from_html(html_str)
|
97 |
markdown_str = self.html_to_markdown(html_str)
|