File size: 2,875 Bytes
99d3f35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from dataclasses import dataclass
import re
from typing import Iterator, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

from bs4 import BeautifulSoup, Tag, ResultSet
import requests


# Pattern for the tag names treated as section headings (h2 and h3); the
# loader passes it to ``find_all`` to collect the headings of the page.
RE_HEADERS = re.compile(r"h[23]")


@dataclass
class Content:
    """One heading-delimited section extracted from a parsed HTML page."""
    # ``name`` attribute of the nearest ``a`` element preceding the heading
    # ("" when there is no such element or it has no ``name`` attribute).
    name: str
    # Text of the heading element itself.
    title: str
    # Concatenated text of the elements stored in ``body``.
    text: str
    # Elements that make up the section, starting with the heading element.
    body: list[Tag]


def _get_anchor_name(header: Tag) -> str:
    for tag in header.previous_elements:
        if tag.name == "a":
            return tag.attrs.get("name", "")
    return ""


def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    has_anchor = False
    for tag in reversed(body):
        if not has_anchor:
            if tag.name == "a":
                has_anchor = True
            continue
        else:
            yield tag


def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return an iterator over ``body`` trimmed by ``_reversed_remove_last_anchor``.

    The helper produces the kept elements back to front; they are collected
    and flipped here so the caller receives them in document order again.
    """
    in_document_order = [*_reversed_remove_last_anchor(body)][::-1]
    return iter(in_document_order)


def _get_bodys_text(body: list[Tag]) -> str:
    text = ""
    for tag in body:
        text += tag.get_text()
    return text


def _get_child_content(header: Tag) -> Content:
    """Build the ``Content`` of the section introduced by ``header``.

    The section consists of ``header`` followed by the elements that come
    after it, up to (not including) the next ``h2`` or ``h3`` element. The
    collected elements are then trimmed with ``_remove_last_anchor`` and
    their text is gathered with ``_get_bodys_text``.

    Args:
        header: Heading element that starts the section.

    Returns:
        A ``Content`` holding the anchor name, the heading text, the section
        text and the trimmed list of elements.
    """
    following = iter(header.next_elements)
    # The first following element is skipped (presumably the heading's own
    # text node, which ``header`` itself already covers).
    next(following, None)

    section = [header]
    for element in following:
        if element.name in ("h2", "h3"):
            # The next heading starts another section.
            break
        section.append(element)

    trimmed = list(_remove_last_anchor(section))
    return Content(
        _get_anchor_name(header),
        header.get_text(),
        _get_bodys_text(trimmed),
        trimmed,
    )


def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Lazily yield one ``Content`` per heading element in ``headers``."""
    yield from (_get_child_content(heading) for heading in headers)


class NVDAUserGuideLoader(BaseLoader):
    """Load a user guide HTML page as one ``Document`` per h2/h3 section.

    Args:
        url: Address of the HTML page to download.
        category: Label stored under the ``"category"`` metadata key of every
            produced document.
        timeout: Seconds to wait for the server before the request is
            aborted. Without a timeout ``requests`` may block indefinitely.
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download ``self.url`` and parse the response with the lxml parser.

        Raises:
            requests.HTTPError: If the server answers with an error status;
                an error page must not be parsed as if it were the guide.
            requests.RequestException: On connection problems or timeout.
        """
        res = requests.get(self.url, timeout=self.timeout)
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` for every h2/h3 section of the page.

        Each document carries the section text as ``page_content`` and the
        metadata keys ``category``, ``source`` (anchor name), ``url`` (page
        URL with the anchor name as fragment) and ``title`` (heading text).
        """
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            metadata = {
                "category": self.category,
                "source": content.name,
                "url": f"{self.url}#{content.name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return all documents produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())


if __name__ == "__main__":
    # Manual smoke run: download the online user guide and print every
    # document extracted from it.
    guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    documents = NVDAUserGuideLoader(guide_url, "en-nvda-user-guide").load()
    print(documents)