import os
import re
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from itertools import takewhile, zip_longest
from typing import Iterator

import bs4
import pandas as pd
from bs4 import BeautifulSoup


@dataclass
class Section:
    url: str
    name: str
    nodes: InitVar[list[bs4.element.NavigableString]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.NavigableString]):
        section = []
        for node in nodes:
            if node.name == "table":
                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "\n".join(section).strip()
        # Remove tabs
        self.text = self.text.replace("\t", "")
        # Collapse runs of newlines into a single newline
        self.text = re.sub("\n{2,}", "\n", self.text)
        # Replace non-breaking spaces with regular spaces
        self.text = self.text.replace("\xa0", " ")

    def __len__(self) -> int:
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor, without parsing."""
        section = cls.__new__(cls)  # Allocate the instance without calling __init__
        # Initialize the fields manually.
        section.text = text
        section.url = url
        section.name = name
        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks."""
        if len(self) > max_length:
            # Get the number of chunks by dividing and rounding up.
            # Then, split the section into equal-length chunks.
            # This can result in chunks below the minimum length,
            # and will truncate the end of the section.
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
            yield self
        return
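
# Illustrative sketch (not part of the original module): building a Section via
# the alternate constructor and splitting it into chunks. The text, URL, name,
# and length bounds below are made-up placeholder values.
#
#   section = Section.from_text("some long text " * 500, "https://example.com/doc", "Intro")
#   chunks = list(section.get_chunks(min_length=100, max_length=2000))
#   assert all(len(chunk) <= 2000 for chunk in chunks)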


@dataclass
class Parser(ABC):
    soup: BeautifulSoup
    base_url: str
    filename: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @abstractmethod
    def build_url(self, suffix: str) -> str:
        ...

    @abstractmethod
    def find_sections(self) -> Iterator[Section]:
        ...

    def parse(self) -> list[Section]:
        """Parse the documents into sections, respecting the length constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections
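
# A concrete parser only needs to implement build_url and find_sections. A
# hypothetical minimal subclass (for illustration only; the class name and
# heading selector are assumptions, not part of this module) could look like:
#
#   class PlainHtmlParser(Parser):
#       def build_url(self, suffix: str) -> str:
#           return self.base_url + suffix
#
#       def find_sections(self) -> Iterator[Section]:
#           for heading in self.soup.find_all("h2"):
#               url = self.build_url("#" + heading.get("id", ""))
#               yield Section(url, heading.text.strip(), heading.find_next_siblings())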


class SphinxParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")

            url = self.build_url(section["href"].strip().replace("\n", ""))
            name = section.parent.text.strip()[:-1].replace("\n", "")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
        for section, next_section in zip_longest(sections, sections[1:]):
            href = section.find("a", href=True, class_="header-link")
            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))

            url = self.build_url(href["href"].strip().replace("\n", ""))
            name = section.text.strip().replace("\n", "")
            yield Section(url, name, nodes)
        return

    def build_url(self, suffix: str) -> str:
        # splitext strips the .html extension from the filename
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
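
# Minimal usage sketch, assuming a locally saved Hugging Face docs page. The
# file path and base URL below are placeholder values, not part of this module.
if __name__ == "__main__":
    with open("index.html", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    parser = HuggingfaceParser(soup, base_url="https://huggingface.co/docs/transformers/", filename="index.html")
    for section in parser.parse():
        print(f"{section.name} ({len(section)} chars) -> {section.url}")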