# buster/parser.py
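"""Parsers that split HTML documentation pages into length-bounded `Section` chunks."""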
import os
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from itertools import takewhile, zip_longest
from typing import Iterator

import bs4
import pandas as pd
from bs4 import BeautifulSoup


@dataclass
class Section:
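    """A contiguous block of documentation text, with its source URL and heading name."""
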
    url: str
    name: str
    nodes: InitVar[list[bs4.element.NavigableString]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.NavigableString]):
        section = []
        for node in nodes:
            if node.name == "table":
                # Render HTML tables as GitHub-flavored markdown.
                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                # Skip inline scripts entirely.
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "".join(section).strip()

    def __len__(self) -> int:
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor that skips HTML parsing."""
        section = cls.__new__(cls)  # Allocate the instance without calling __init__.
        # Initialize the fields directly.
        section.text = text
        section.url = url
        section.name = name
        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks that respect the length constraints."""
        if len(self) > max_length:
            # Get the number of chunks by dividing and rounding up,
            # then split the section into equal-length chunks.
            # This can result in chunks below the minimum length,
            # and will truncate the end of the section.
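            # Worked example (illustrative): len(self) == 2501 with max_length == 2000
            # gives n_chunks == 2 and length == 1250, so the two chunks cover
            # 2500 characters and the final character is dropped.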
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
            yield self
        # Sections at or below min_length are dropped.
        return


@dataclass
class Parser(ABC):
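    """Abstract base class for HTML documentation parsers."""
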
    soup: BeautifulSoup
    base_url: str
    filename: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @abstractmethod
    def build_url(self, suffix: str) -> str:
        """Build the full URL of a section from its href suffix."""
        ...

    @abstractmethod
    def find_sections(self) -> Iterator[Section]:
        """Yield the sections found in the soup."""
        ...

    def parse(self) -> list[Section]:
        """Parse the document into sections, respecting the length constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections


class SphinxParser(Parser):
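    """Parser for Sphinx-generated documentation pages."""
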
    def find_sections(self) -> Iterator[Section]:
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")

            url = self.build_url(section["href"].strip().replace("\n", ""))
            # The heading text ends with the headerlink marker; [:-1] drops it.
            name = section.parent.text.strip()[:-1].replace("\n", "")

            # If the section has subsections, keep only the part before the first subsection.
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
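    """Parser for Hugging Face documentation pages."""
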
    def find_sections(self) -> Iterator[Section]:
        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
        # Pair each heading with the next one; zip_longest pads the last pair with
        # None, so takewhile collects every sibling after the final heading.
        for section, next_section in zip_longest(sections, sections[1:]):
            href = section.find("a", href=True, class_="header-link")
            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

            url = self.build_url(href["href"].strip().replace("\n", ""))
            name = section.text.strip().replace("\n", "")
            yield Section(url, name, nodes)
        return

    def build_url(self, suffix: str) -> str:
        # splitext removes the .html extension from the filename.
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
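

# Minimal usage sketch (illustrative, not part of the original module): parse a
# saved Hugging Face docs page into length-bounded sections. The file path and
# base URL below are hypothetical placeholders.
if __name__ == "__main__":
    with open("docs_page.html", encoding="utf-8") as f:  # hypothetical input file
        soup = BeautifulSoup(f.read(), "html.parser")
    parser = HuggingfaceParser(soup, base_url="https://huggingface.co/docs/", filename="docs_page.html")
    for section in parser.parse():
        print(section.url, section.name, len(section))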