Spaces:

terapyon
/

nvdajp-book-qa

Paused

App Files Files Community

nvdajp-book-qa / nvda_ug_loader.py

terapyon

dev/add-resoce (#2)

99d3f35 about 1 year ago

raw

history blame

No virus

2.88 kB

	from dataclasses import dataclass
	import re
	from typing import Iterator, List
	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader

	from bs4 import BeautifulSoup, Tag, ResultSet
	import requests


	# Tag-name pattern for the headings that start a section ("h2" and "h3").
	# NVDAUserGuideLoader.lazy_load passes it to soup.find_all() to collect the
	# headings. Note that the pattern is not anchored.
	RE_HEADERS = re.compile(r"h[23]")


	@dataclass
	class Content:
	    """One section of the page, as assembled by ``_get_child_content``.

	    Attributes:
	        name: ``name`` attribute of the ``<a>`` found before the heading,
	            or "" when there is none.
	        title: Text of the heading element.
	        text: Concatenated text of every element in ``body``.
	        body: Elements collected for the section, heading first.
	    """

	    name: str
	    title: str
	    text: str
	    body: list[Tag]


	def _get_anchor_name(header: Tag) -> str:
	for tag in header.previous_elements:
	if tag.name == "a":
	return tag.attrs.get("name", "")
	return ""


	def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
	has_anchor = False
	for tag in reversed(body):
	if not has_anchor:
	if tag.name == "a":
	has_anchor = True
	continue
	else:
	yield tag


	def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
	    """Return *body* filtered by ``_reversed_remove_last_anchor``, in document order.

	    The helper produces the retained elements back to front; this wrapper
	    materialises them and reverses them again so the caller sees the
	    original ordering.

	    Args:
	        body: Elements of one section, in document order.

	    Returns:
	        An iterator over the retained elements, first one first.
	    """
	    kept_back_to_front = list(_reversed_remove_last_anchor(body))
	    return reversed(kept_back_to_front)


	def _get_bodys_text(body: list[Tag]) -> str:
	text = ""
	for tag in body:
	text += tag.get_text()
	return text


	def _get_child_content(header: Tag) -> Content:
	    """Build the ``Content`` for the section that *header* introduces.

	    Elements are taken from ``header.next_elements`` until the next "h2" or
	    "h3" is reached (or the elements run out). The collected list, with
	    *header* in front, is passed through ``_remove_last_anchor`` and its
	    text is produced by ``_get_bodys_text``.

	    NOTE(review): ``next_elements`` also yields nested tags and their text
	    nodes, so the same text can be collected more than once - confirm that
	    this is intended.

	    Args:
	        header: Heading element that starts the section.

	    Returns:
	        A ``Content`` holding the anchor name, heading text, section text
	        and the list of retained elements.
	    """
	    following = iter(header.next_elements)
	    # The very first following element is skipped unconditionally
	    # (presumably the heading's own text node - verify).
	    next(following, None)

	    collected = [header]
	    for element in following:
	        if element.name in ("h2", "h3"):
	            break
	        collected.append(element)

	    section_body = list(_remove_last_anchor(collected))
	    return Content(
	        _get_anchor_name(header),
	        header.get_text(),
	        _get_bodys_text(section_body),
	        section_body,
	    )


	def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
	    """Lazily turn each heading in *headers* into a ``Content``.

	    Args:
	        headers: Heading elements, processed in the order given.

	    Yields:
	        The result of ``_get_child_content`` for each heading.
	    """
	    yield from map(_get_child_content, headers)


	class NVDAUserGuideLoader(BaseLoader):
	    """Load an HTML user guide page as one ``Document`` per h2/h3 section.

	    Args:
	        url: Address of the HTML page to download.
	        category: Value stored under "category" in every document's metadata.
	        timeout: Seconds handed to ``requests.get`` as its timeout, so a
	            stalled server cannot block the loader forever.
	    """

	    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
	        self.url = url
	        self.category = category
	        self.timeout = timeout

	    def fetch(self) -> BeautifulSoup:
	        """Download ``self.url`` and parse the response body with lxml.

	        Returns:
	            The parsed page.

	        Raises:
	            requests.RequestException: On connection failure, timeout, or
	                an HTTP error status.
	        """
	        res = requests.get(self.url, timeout=self.timeout)
	        # Fail loudly instead of parsing an error page into documents.
	        res.raise_for_status()
	        return BeautifulSoup(res.content, "lxml")

	    def lazy_load(self) -> Iterator[Document]:
	        """Fetch the page and yield one ``Document`` per matched heading.

	        Headings are selected with ``RE_HEADERS``. Each document carries
	        the section text as ``page_content`` and metadata with the keys
	        "category", "source" (anchor name), "url" (page URL plus
	        ``#<anchor name>``) and "title" (heading text).
	        """
	        soup = self.fetch()
	        headers = soup.find_all(RE_HEADERS)
	        for content in get_contents(headers):
	            metadata = {
	                "category": self.category,
	                "source": content.name,
	                "url": f"{self.url}#{content.name}",
	                "title": content.title,
	            }
	            yield Document(page_content=content.text, metadata=metadata)

	    def load(self) -> List[Document]:
	        """Return every document produced by ``lazy_load`` as a list."""
	        return list(self.lazy_load())


	if __name__ == "__main__":
	    # Manual smoke test: download the English NVDA user guide and print
	    # the documents the loader produces.
	    url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
	    loader = NVDAUserGuideLoader(url, "en-nvda-user-guide")
	    print(loader.load())