import os
import re
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from io import StringIO
from itertools import takewhile, zip_longest
from typing import Iterator

import bs4
import pandas as pd
from bs4 import BeautifulSoup


@dataclass
class Section:
    url: str
    name: str
    nodes: InitVar[list[bs4.element.PageElement]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.PageElement]):
        section = []
        for node in nodes:
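            # Render HTML tables as GitHub-flavored markdown so their
            # structure survives the conversion to plain text.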
            if node.name == "table":
                node_text = pd.read_html(StringIO(node.prettify()))[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "\n".join(section).strip()

        # Remove tabs
        self.text = self.text.replace("\t", "")

        # Replace group of newlines with a single newline
        self.text = re.sub("\n{2,}", "\n", self.text)

        # Replace non-breaking spaces with regular spaces
        self.text = self.text.replace("\xa0", " ")

    def __len__(self) -> int:
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor, without parsing."""
        section = cls.__new__(cls)  # Allocate memory, does not call __init__
        # Does the init here.
        section.text = text
        section.url = url
        section.name = name

        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks."""
        if len(self) > max_length:
            # Get the number of chunks by dividing and rounding up,
            # then split the section into chunks of equal length.
            # Because the per-chunk length is rounded down, this can
            # produce chunks below the minimum length, and up to
            # n_chunks - 1 characters at the end of the section are
            # truncated. E.g. 3000 characters with max_length=2000
            # gives two chunks of 1500 characters each.
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
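            # Sections no longer than min_length are silently dropped.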
            yield self
        return


@dataclass
class Parser(ABC):
    soup: BeautifulSoup
    base_url: str
    filename: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @abstractmethod
    def build_url(self, suffix: str) -> str:
        ...

    @abstractmethod
    def find_sections(self) -> Iterator[Section]:
        ...

    def parse(self) -> list[Section]:
        """Parse the documents into sections, respecting the lenght constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections


class SphinxParser(Parser):
    def find_sections(self) -> Iterator[Section]:
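        # Sphinx emits a permalink anchor (class "headerlink") next to every
        # section heading; each anchor marks the start of one section.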
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")

            url = self.build_url(section["href"].strip().replace("\n", ""))
            name = section.parent.text.strip()[:-1].replace("\n", "")

            # If the section has subsections, keep only the content before the
            # first nested <section> tag. previous_siblings iterates backwards
            # from that tag, so reverse it to restore document order.
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
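        # Pair each heading with the one that follows it; the last heading is
        # paired with None, so takewhile consumes all remaining siblings.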
        for section, next_section in zip_longest(sections, sections[1:]):
            href = section.find("a", href=True, class_="header-link")
            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

            url = self.build_url(href["href"].strip().replace("\n", ""))
            name = section.text.strip().replace("\n", "")
            yield Section(url, name, nodes)

        return

    def build_url(self, suffix: str) -> str:
        # splitext strips the .html extension before the anchor suffix is appended.
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
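

if __name__ == "__main__":
    # Quick smoke test: a minimal sketch, not part of the original module.
    # Builds a Section from synthetic text (the URL is a placeholder) and
    # shows how an over-long section is split into equal-length chunks.
    demo = Section.from_text("word " * 600, "https://example.org/docs/", "demo")
    for chunk in demo.get_chunks(min_length=100, max_length=2000):
        # 3000 characters with max_length=2000 -> two chunks of 1500 each.
        print(chunk.name, len(chunk))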