Hansimov committed on
Commit
cff1afc
1 Parent(s): 0acc824

:boom: [Fix] WebpageContentExtractor: UnicodeDecodeError

Browse files
documents/webpage_content_extractor.py CHANGED
@@ -81,8 +81,17 @@ class WebpageContentExtractor:
81
  logger.warn(f"File not found: {html_path}")
82
  return ""
83
 
84
- with open(html_path, "r", encoding="utf-8") as rf:
85
- html_str = rf.read()
 
 
 
 
 
 
 
 
 
86
 
87
  html_str = self.remove_elements_from_html(html_str)
88
  markdown_str = self.html_to_markdown(html_str)
 
81
  logger.warn(f"File not found: {html_path}")
82
  return ""
83
 
84
+ encodings = ["utf-8", "latin-1"]
85
+ for encoding in encodings:
86
+ try:
87
+ with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
88
+ html_str = rf.read()
89
+ break
90
+ except UnicodeDecodeError:
91
+ pass
92
+ else:
93
+ logger.warn(f"No matching encodings: {html_path}")
94
+ return ""
95
 
96
  html_str = self.remove_elements_from_html(html_str)
97
  markdown_str = self.html_to_markdown(html_str)