Spaces:

terapyon
/

nvdajp-book-qa

Paused

App Files Files Community

nvdajp-book-qa / nvda_ug_loader.py

terapyon

added NVDA User guide content and added filter QA

227586c 11 months ago

raw history blame

No virus

2.88 kB

	from dataclasses import dataclass
	import re
	from typing import Iterator, List
	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader

	from bs4 import BeautifulSoup, Tag, ResultSet
	import requests


	# Tag-name pattern handed to ``soup.find_all()`` in
	# ``NVDAUserGuideLoader.lazy_load`` to select the section headers.
	# NOTE(review): the pattern is unanchored -- if bs4 applies it with
	# ``search()``, any tag whose name merely contains "h2"/"h3" matches; confirm.
	RE_HEADERS = re.compile(r"h[23]")


	@dataclass
	class Content:
	    """One section of the guide: a header plus the elements that follow it."""

	    # ``name`` attribute of the anchor found before the header ("" if none).
	    name: str
	    # Text of the header element itself.
	    title: str
	    # Concatenated text of the elements in ``body``.
	    text: str
	    # The header followed by the elements collected up to the next header.
	    body: list[Tag]


	def _get_anchor_name(header: Tag) -> str:
	for tag in header.previous_elements:
	if tag.name == "a":
	return tag.attrs.get("name", "")
	return ""


	def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
	has_anchor = False
	for tag in reversed(body):
	if not has_anchor:
	if tag.name == "a":
	has_anchor = True
	continue
	else:
	yield tag


	def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
	    """Return the output of ``_reversed_remove_last_anchor`` in forward order."""
	    kept_backwards = [*_reversed_remove_last_anchor(body)]
	    return reversed(kept_backwards)


	def _get_bodys_text(body: list[Tag]) -> str:
	text = ""
	for tag in body:
	text += tag.get_text()
	return text


	def _get_child_content(header: Tag) -> Content:
	    """Build the ``Content`` for the section introduced by *header*.

	    The section body is *header* followed by the elements from
	    ``header.next_elements`` up to (not including) the next ``h2``/``h3``.
	    The trailing anchor is then stripped via ``_remove_last_anchor`` before
	    the text is extracted.
	    """
	    section = [header]
	    following = iter(header.next_elements)
	    # The first following element is dropped unconditionally -- presumably
	    # the header's own text node, already covered by the header; confirm.
	    next(following, None)
	    for element in following:
	        if element.name in ("h2", "h3"):
	            break
	        section.append(element)
	    trimmed = list(_remove_last_anchor(section))
	    return Content(
	        _get_anchor_name(header),
	        header.get_text(),
	        _get_bodys_text(trimmed),
	        trimmed,
	    )


	def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
	    """Lazily yield one ``Content`` per header, in the order given."""
	    yield from (_get_child_content(section_header) for section_header in headers)


	class NVDAUserGuideLoader(BaseLoader):
	    """Load an HTML user guide as one ``Document`` per header section.

	    The page at ``url`` is downloaded, every element matched by
	    ``RE_HEADERS`` is treated as a section header, and each section becomes
	    a ``Document`` whose metadata records the category, the section's anchor
	    name, a deep link (``url#anchor``) and the header title.

	    Args:
	        url: Address of the HTML page to load.
	        category: Value stored under ``"category"`` in every document's
	            metadata.
	        timeout: Seconds to wait for the HTTP request before giving up.
	    """

	    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
	        self.url = url
	        self.category = category
	        self.timeout = timeout

	    def fetch(self) -> BeautifulSoup:
	        """Download ``self.url`` and return it parsed with the lxml parser.

	        Raises:
	            requests.RequestException: If the request times out, fails, or
	                the server answers with an error status.
	        """
	        # Without a timeout a stalled server would block the loader forever.
	        res = requests.get(self.url, timeout=self.timeout)
	        # Fail loudly instead of indexing an HTTP error page as guide content.
	        res.raise_for_status()
	        return BeautifulSoup(res.content, 'lxml')

	    def lazy_load(self) -> Iterator[Document]:
	        """Fetch the page and yield one ``Document`` per section."""
	        soup = self.fetch()
	        for content in get_contents(soup.find_all(RE_HEADERS)):
	            metadata = {
	                "category": self.category,
	                "source": content.name,
	                "url": f"{self.url}#{content.name}",
	                "title": content.title,
	            }
	            yield Document(page_content=content.text, metadata=metadata)

	    def load(self) -> List[Document]:
	        """Return every document produced by ``lazy_load`` as a list."""
	        return list(self.lazy_load())


	if __name__ == "__main__":
	    # Manual smoke test: load the English NVDA user guide and dump the result.
	    guide_loader = NVDAUserGuideLoader(
	        "https://www.nvaccess.org/files/nvda/documentation/userGuide.html",
	        "en-nvda-user-guide",
	    )
	    print(guide_loader.load())