nvdajp-book-qa / nvda_ug_loader.py
terapyon's picture
dev/add-resoce (#2)
99d3f35
raw history blame
No virus
2.88 kB
from dataclasses import dataclass
import re
from typing import Iterator, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from bs4 import BeautifulSoup, Tag, ResultSet
import requests
# Tag-name filter for the section headings (h2 and h3). It is passed to
# ``soup.find_all`` in ``NVDAUserGuideLoader.lazy_load`` to locate the
# headings that split the guide into sections.
# NOTE(review): the pattern is unanchored, so a tag name that merely contains
# "h2" or "h3" would also match if the filter is applied with ``search`` —
# confirm against the Beautiful Soup documentation.
RE_HEADERS = re.compile(r"h[23]")
@dataclass
class Content:
    """One section of the guide, extracted from a single heading.

    Instances are produced by ``_get_child_content`` and consumed by
    ``NVDAUserGuideLoader.lazy_load``.
    """

    # ``name`` attribute of the anchor found before the heading ("" if none).
    name: str
    # Text of the heading itself.
    title: str
    # Text extracted from ``body``.
    text: str
    # The heading followed by the elements collected after it.
    body: list[Tag]
def _get_anchor_name(header: Tag) -> str:
    """Return the ``name`` attribute of the nearest ``<a>`` before *header*.

    The document is walked backwards from *header*; the first ``<a>`` element
    met decides the result.

    Returns:
        That anchor's ``name`` attribute, or ``""`` when the anchor has no
        such attribute or when no ``<a>`` precedes *header* at all.
    """
    nearest_anchor = next(
        (node for node in header.previous_elements if node.name == "a"),
        None,
    )
    if nearest_anchor is None:
        return ""
    return nearest_anchor.attrs.get("name", "")
def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Yield *body* back to front without its last ``<a>`` and what follows it.

    The last ``<a>`` in a section body is expected to be the anchor that
    belongs to the *next* heading, so it and everything after it are dropped.

    Args:
        body: Elements of one section, in document order.

    Yields:
        The kept elements in reverse order. When *body* contains no ``<a>``
        there is nothing to trim and every element is yielded (previously
        such a body produced no elements at all, which emptied sections that
        are not followed by another anchor).
    """
    # Index just past the last element to keep; defaults to "keep everything".
    cut = len(body)
    for index in range(len(body) - 1, -1, -1):
        if body[index].name == "a":
            cut = index
            break
    yield from reversed(body[:cut])
def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return *body* in document order after trimming its trailing anchor.

    ``_reversed_remove_last_anchor`` does the trimming but yields back to
    front; its output is materialised here and flipped to restore the
    original ordering.
    """
    kept_back_to_front = [*_reversed_remove_last_anchor(body)]
    return reversed(kept_back_to_front)
def _get_bodys_text(body: list[Tag]) -> str:
    """Return the concatenated text of *body*, taking nested text only once.

    *body* may hold a container element as well as the nodes nested inside it
    (this is what walking a document in parse order produces). A container's
    ``get_text()`` already includes the text of everything below it, so an
    element that is a descendant of an earlier entry is skipped instead of
    being appended a second time.

    Args:
        body: Elements in document order; each must offer ``get_text()``.

    Returns:
        The joined text, or ``""`` for an empty *body*.
    """
    # Track nodes by identity: distinct elements with equal markup must not
    # be mistaken for one another.
    covered: set[int] = set()
    parts: list[str] = []
    for element in body:
        if id(element) in covered:
            continue
        parts.append(element.get_text())
        # Plain text nodes have no ``descendants``; treat them as leaves.
        covered.update(id(node) for node in getattr(element, "descendants", ()))
    return "".join(parts)
def _get_child_content(header: Tag) -> Content:
    """Build the ``Content`` for the section introduced by *header*.

    The section body is *header* itself plus every element that follows it in
    parse order, up to (not including) the next ``h2`` or ``h3``. The anchor
    belonging to that next heading is then trimmed off the end of the body.

    Args:
        header: The heading element that starts the section.

    Returns:
        A ``Content`` carrying the anchor name found before *header*, the
        heading text, the section text, and the trimmed body.
    """
    title = header.get_text()
    name = _get_anchor_name(header)

    # ``next_elements`` starts with the heading's own children. Skip exactly
    # those nodes (by identity) rather than "the first element", so that a
    # heading with no children or with several children is handled correctly.
    own_nodes = {id(node) for node in header.descendants}

    body = [header]
    for element in header.next_elements:
        if id(element) in own_nodes:
            continue
        if element.name in ("h2", "h3"):
            break
        body.append(element)

    trimmed_body = list(_remove_last_anchor(body))
    text = _get_bodys_text(trimmed_body)
    return Content(name, title, text, trimmed_body)
def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Lazily yield one ``Content`` per heading in *headers*, in order."""
    yield from map(_get_child_content, headers)
class NVDAUserGuideLoader(BaseLoader):
    """Load an NVDA user guide HTML page as one ``Document`` per section.

    The page is downloaded, every heading matched by ``RE_HEADERS`` is
    located, and the content after each heading becomes a separate document
    whose metadata records the category, the anchor name, a deep link to the
    section, and the heading text.
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        """Store the loader settings.

        Args:
            url: Address of the user guide HTML page.
            category: Value stored under ``"category"`` in every document's
                metadata.
            timeout: Seconds to wait for the server before the download is
                abandoned.
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download the page and return it parsed with the ``lxml`` parser.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.Timeout: The server did not answer within ``timeout``.
        """
        # Without a timeout the request can block forever.
        res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of indexing an error page as if it were the guide.
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per section heading found in the page."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            metadata = {
                "category": self.category,
                "source": content.name,
                "url": f"{self.url}#{content.name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return every document produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())
if __name__ == "__main__":
    # Manual smoke run: load the English user guide and dump the documents.
    guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    documents = NVDAUserGuideLoader(guide_url, "en-nvda-user-guide").load()
    print(documents)