File size: 1,591 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""

import logging
from typing import Any, Dict, Iterator, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class BS4HTMLParser(BaseBlobParser):
    """Pparse HTML files using `Beautiful Soup`."""

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser."""
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects."""
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        if soup.title:
            title = str(soup.title.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)