import os
from abc import ABC, abstractmethod
from dataclasses import InitVar, dataclass, field
from itertools import takewhile, zip_longest
from typing import Iterator

import bs4
import pandas as pd
from bs4 import BeautifulSoup


@dataclass
class Section:
    url: str
    name: str
    nodes: InitVar[list[bs4.element.PageElement]]
    text: str = field(init=False)

    def __post_init__(self, nodes: list[bs4.element.PageElement]):
        section = []
        for node in nodes:
            if node.name == "table":
                # Render tables as GitHub-flavoured markdown instead of raw text.
                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
            elif node.name == "script":
                continue
            else:
                node_text = node.text
            section.append(node_text)
        self.text = "".join(section).strip()

    def __len__(self) -> int:
        return len(self.text)

    @classmethod
    def from_text(cls, text: str, url: str, name: str) -> "Section":
        """Alternate constructor, without parsing."""
        section = cls.__new__(cls)  # Allocate the instance without calling __init__.
        # Initialise the fields manually.
        section.text = text
        section.url = url
        section.name = name
        return section

    def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
        """Split a section into chunks."""
        if len(self) > max_length:
            # Get the number of chunks by dividing and rounding up,
            # then split the section into equal-length chunks.
            # This can result in chunks below the minimum length,
            # and truncates up to n_chunks - 1 characters at the end of the section.
            n_chunks = (len(self) + max_length - 1) // max_length
            length = len(self) // n_chunks
            for chunk in range(n_chunks):
                start = chunk * length
                yield Section.from_text(self.text[start : start + length], self.url, self.name)
        elif len(self) > min_length:
            yield self
        return
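

# Worked example of the chunking arithmetic above (illustrative values only,
# not part of the module): a 2500-character section with max_length=1000 gives
# n_chunks = (2500 + 999) // 1000 = 3 chunks of length 2500 // 3 = 833, and
# the final 2500 - 3 * 833 = 1 character is truncated.
#
#     demo = Section.from_text("x" * 2500, "https://example.org", "demo")
#     chunk_lengths = [len(c) for c in demo.get_chunks(min_length=100, max_length=1000)]
#     assert chunk_lengths == [833, 833, 833]

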
@dataclass
class Parser(ABC):
    soup: BeautifulSoup
    base_url: str
    filename: str
    min_section_length: int = 100
    max_section_length: int = 2000

    @abstractmethod
    def build_url(self, suffix: str) -> str:
        ...

    @abstractmethod
    def find_sections(self) -> Iterator[Section]:
        ...

    def parse(self) -> list[Section]:
        """Parse the document into sections, respecting the length constraints."""
        sections = []
        for section in self.find_sections():
            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
        return sections


class SphinxParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        for section in self.soup.find_all("a", href=True, class_="headerlink"):
            container = section.parent.parent
            section_href = container.find_all("a", href=True, class_="headerlink")
            url = self.build_url(section["href"].strip().replace("\n", ""))
            # Drop the trailing headerlink character (e.g. "¶") from the title.
            name = section.parent.text.strip()[:-1].replace("\n", "")
            # If the section has subsections, keep only the part before the first subsection.
            if len(section_href) > 1 and container.section is not None:
                siblings = list(container.section.previous_siblings)[::-1]
                section = Section(url, name, siblings)
            else:
                section = Section(url, name, container.children)
            yield section
        return

    def build_url(self, suffix: str) -> str:
        return self.base_url + self.filename + suffix


class HuggingfaceParser(Parser):
    def find_sections(self) -> Iterator[Section]:
        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
        # zip_longest pairs the last heading with None, so takewhile keeps
        # every remaining sibling for the final section.
        for section, next_section in zip_longest(sections, sections[1:]):
            href = section.find("a", href=True, class_="header-link")
            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))
            url = self.build_url(href["href"].strip().replace("\n", ""))
            name = section.text.strip().replace("\n", "")
            yield Section(url, name, nodes)
        return

    def build_url(self, suffix: str) -> str:
        # splitext removes the .html extension from the filename.
        return self.base_url + os.path.splitext(self.filename)[0] + suffix
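

if __name__ == "__main__":
    # Minimal usage sketch. The HTML snippet, base URL, and filename below are
    # made-up examples, assuming a Sphinx-style page where each heading
    # carries an <a class="headerlink"> anchor ending in "¶".
    html = """
    <div class="section">
      <h2>Installation<a class="headerlink" href="#installation">¶</a></h2>
      <p>Install the package with pip, then import it to confirm that the
      installation worked as expected.</p>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    # min_section_length is lowered so the toy section is not filtered out.
    parser = SphinxParser(soup, "https://example.org/docs/", "install.html", min_section_length=10)
    for section in parser.parse():
        print(section.url, section.name, len(section), sep=" | ")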