"""LangChain document loader that splits the NVDA user guide into sections."""

from dataclasses import dataclass
from itertools import islice
import re
from typing import Iterator, List

from bs4 import BeautifulSoup, Tag, ResultSet
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
import requests

# Pattern handed to ``soup.find_all`` to select the section headers.
RE_HEADERS = re.compile(r"h[23]")

# Tag names that terminate the body of the current section.
_SECTION_HEADERS = ("h2", "h3")


@dataclass
class Content:
    """One section of the page: a header plus the elements following it."""

    name: str  # ``name`` attribute of the nearest ``<a>`` before the header
    title: str  # text of the header
    text: str  # concatenated text of ``body``
    body: list[Tag]  # the header followed by the elements collected after it


def _tag_name(element) -> str | None:
    """Return ``element.name``, or ``None`` if the element has no such attribute."""
    return getattr(element, "name", None)


def _get_anchor_name(header: Tag) -> str:
    """Return the ``name`` attribute of the nearest ``<a>`` before *header*.

    An empty string is returned when no ``<a>`` precedes the header or when
    the nearest one carries no ``name`` attribute.
    """
    for element in header.previous_elements:
        if _tag_name(element) == "a":
            return element.attrs.get("name", "")
    return ""


def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Yield *body* in reverse order without its last ``<a>`` and what follows it.

    When *body* contains no ``<a>`` at all there is nothing to remove, so
    every element is yielded (previously the whole body was discarded, which
    produced empty text for a section not followed by an anchor).
    """
    elements = list(body)
    last_anchor = next(
        (
            index
            for index in range(len(elements) - 1, -1, -1)
            if _tag_name(elements[index]) == "a"
        ),
        None,
    )
    kept = elements if last_anchor is None else elements[:last_anchor]
    yield from reversed(kept)


def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return *body* in document order, cut just before its last ``<a>``."""
    return reversed(list(_reversed_remove_last_anchor(body)))


def _get_bodys_text(body: list[Tag]) -> str:
    """Concatenate the text of the elements in *body*.

    *body* may contain both an element and its descendants. An element whose
    ancestor is also in *body* is skipped, because the ancestor's
    ``get_text()`` already includes its text; otherwise nested text would be
    repeated once per nesting level.
    """
    # Compare by identity: element equality in Beautiful Soup is content-based.
    collected = {id(element) for element in body}
    parts = [
        element.get_text()
        for element in body
        if not any(id(ancestor) in collected for ancestor in element.parents)
    ]
    return "".join(parts)


def _get_child_content(header: Tag) -> Content:
    """Build the ``Content`` for the section introduced by *header*.

    The body is the header followed by every subsequent element up to (not
    including) the next ``h2``/``h3``, with the trailing anchor removed.
    """
    title = header.get_text()
    name = _get_anchor_name(header)

    body = [header]
    # The first following element is always skipped; for a header with
    # children that is the header's own first child.
    for child in islice(header.next_elements, 1, None):
        if _tag_name(child) in _SECTION_HEADERS:
            break
        body.append(child)

    trimmed_body = list(_remove_last_anchor(body))
    return Content(name, title, _get_bodys_text(trimmed_body), trimmed_body)


def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Yield one ``Content`` per header in *headers*, in order."""
    for header in headers:
        yield _get_child_content(header)


class NVDAUserGuideLoader(BaseLoader):
    """Load an HTML user guide and emit one ``Document`` per h2/h3 section.

    Args:
        url: Address of the HTML page to download.
        category: Value stored under the ``"category"`` metadata key of
            every produced document.
        timeout: Seconds to wait for the HTTP request before giving up.
    """

    def __init__(self, url: str, category: str, *, timeout: float = 30.0) -> None:
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download ``self.url`` and parse it with the ``lxml`` parser.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of indexing an error page as documentation.
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield a ``Document`` for each section of the fetched page."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            name = content.name
            metadata = {
                "category": self.category,
                "source": name,
                "url": f"{self.url}#{name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return all section documents as a list."""
        return list(self.lazy_load())


def _main() -> None:
    """Load the English NVDA user guide and print the resulting documents."""
    url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    loader = NVDAUserGuideLoader(url, "en-nvda-user-guide")
    data = loader.load()
    print(data)


if __name__ == "__main__":
    _main()