nvdajp-book-qa / nvda_ug_loader.py
terapyon's picture
added NVDA User guide content and added filter QA
227586c
raw
history blame
2.88 kB
from dataclasses import dataclass
import re
from typing import Iterator, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from bs4 import BeautifulSoup, Tag, ResultSet
import requests
# Pattern for the heading tag names ("h2" / "h3") that delimit sections;
# it is passed to soup.find_all() in NVDAUserGuideLoader.lazy_load.
RE_HEADERS = re.compile(r"h[23]")
@dataclass
class Content:
    """One heading-delimited section extracted from the parsed page."""

    # ``name`` attribute of the anchor found before the heading, or "" when
    # that anchor has no ``name`` / no anchor is found.
    name: str
    # Text of the heading element itself.
    title: str
    # Concatenated text of the elements kept in ``body``.
    text: str
    # The heading followed by the elements collected for this section.
    body: list[Tag]
def _get_anchor_name(header: Tag) -> str:
    """Return the ``name`` attribute of the closest ``<a>`` before *header*.

    Walks ``header.previous_elements`` and stops at the first element whose
    tag name is ``"a"``. Returns ``""`` when no such element exists or when
    that element has no ``name`` attribute.
    """
    nearest_anchor = next(
        (node for node in header.previous_elements if node.name == "a"),
        None,
    )
    if nearest_anchor is None:
        return ""
    return nearest_anchor.attrs.get("name", "")
def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Yield *body* in reverse order with its trailing anchor stripped.

    The last ``<a>`` element in *body* and everything after it are dropped;
    the remaining elements are yielded from last to first.

    When *body* contains no ``<a>`` element there is nothing to strip, so
    every element is yielded (in reverse). Previously such a body produced
    no elements at all, which silently discarded the whole section.
    """
    # Index of the first element to drop; len(body) means "keep everything".
    cut = len(body)
    for index in range(len(body) - 1, -1, -1):
        if body[index].name == "a":
            cut = index
            break
    yield from reversed(body[:cut])
def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return the output of ``_reversed_remove_last_anchor`` in document order.

    The helper yields the kept elements last-to-first; they are materialised
    and reversed again so callers iterate them first-to-last.
    """
    kept_backwards = [*_reversed_remove_last_anchor(body)]
    return reversed(kept_backwards)
def _get_bodys_text(body: list[Tag]) -> str:
    """Return the text of the section made up of the elements in *body*.

    *body* may contain both a container element and that container's own
    descendants (it is collected from a flat element walk). Calling
    ``get_text()`` on a container already returns the text of everything
    nested in it, so an element is skipped when one of its ancestors is also
    in *body*; otherwise nested text would be repeated in the result.

    Returns ``""`` for an empty *body*.
    """
    # Compare by identity: distinct elements can compare/hash equal by value
    # (e.g. two identical text nodes), which would give false ancestor hits.
    in_body = {id(element) for element in body}
    parts = [
        element.get_text()
        for element in body
        if not any(id(parent) in in_body for parent in element.parents)
    ]
    return "".join(parts)
def _get_child_content(header: Tag) -> Content:
    """Build the Content for the section introduced by *header*.

    The section body starts with *header* and then takes the elements from
    ``header.next_elements`` up to (not including) the next ``h2``/``h3``.
    The trailing anchor is stripped via ``_remove_last_anchor`` before the
    text is extracted.
    """
    heading_text = header.get_text()
    anchor_name = _get_anchor_name(header)

    following = iter(header.next_elements)
    # The first following element is always skipped without inspection
    # (presumably the heading's own text node, which header.get_text()
    # already covers -- confirm against the parsed markup).
    next(following, None)

    section = [header]
    for element in following:
        if element.name in ("h2", "h3"):
            break
        section.append(element)

    trimmed_section = list(_remove_last_anchor(section))
    return Content(
        name=anchor_name,
        title=heading_text,
        text=_get_bodys_text(trimmed_section),
        body=trimmed_section,
    )
def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Lazily yield one Content per heading in *headers*, in the given order."""
    yield from map(_get_child_content, headers)
class NVDAUserGuideLoader(BaseLoader):
    """Load an NVDA user guide HTML page as one Document per h2/h3 section.

    Each Document's ``page_content`` is the section text. Its metadata holds
    ``category`` (as given to the constructor), ``source`` (the section's
    anchor name), ``url`` (the page URL followed by ``#`` and the anchor
    name) and ``title`` (the heading text).
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        """Store the loader configuration; no network access happens here.

        Args:
            url: Address of the user guide HTML page.
            category: Label copied into every Document's metadata.
            timeout: Seconds to wait for the HTTP request before giving up.
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download the page and return it parsed with the lxml parser.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                non-success HTTP status.
        """
        # Without a timeout requests may block indefinitely on a stalled server.
        res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of parsing an error page as guide content.
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Fetch the page and yield one Document per h2/h3 heading."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            name = content.name
            metadata = {
                "category": self.category,
                "source": name,
                "url": f"{self.url}#{name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return every Document produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())
if __name__ == "__main__":
    # Manual smoke run: load the English user guide and dump the Documents.
    user_guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    documents = NVDAUserGuideLoader(user_guide_url, "en-nvda-user-guide").load()
    print(documents)