Spaces:

zhangyi617
/

webui

Runtime error

File size: 1,591 Bytes

129cd69

"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""

import logging
from typing import Any, Dict, Iterator, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class BS4HTMLParser(BaseBlobParser):
    """Parse HTML files using `Beautiful Soup`.

    Each blob is turned into a single ``Document`` whose content is the text
    extracted from the HTML and whose metadata carries the blob source and
    the page title.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Value passed as ``features`` to the ``BeautifulSoup``
                constructor.
            get_text_separator: Separator passed to ``soup.get_text`` when
                extracting the page text.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                ``BeautifulSoup`` constructor.

        Raises:
            ImportError: If the ``bs4`` module cannot be imported.
        """
        # Fail at construction time rather than on the first parse call.
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from err

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: Blob whose bytes are parsed as HTML.

        Yields:
            One ``Document`` with the extracted text as ``page_content`` and
            ``source`` / ``title`` entries in its metadata.
        """
        from bs4 import BeautifulSoup

        # Build the soup while the byte stream is still open.
        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title_tag = soup.title
        if title_tag is None:
            title = ""
        elif title_tag.string is not None:
            title = str(title_tag.string)
        else:
            # ``.string`` is None for an empty <title> or one with several
            # child nodes; str(None) would store the literal text "None".
            title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)