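# NOTE: the next three lines are web-page residue captured with this file,
# not part of the module.
# Spaces:
# Runtime error
# Runtime error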
import asyncio
import functools
from typing import Any, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document
class Html2TextTransformer(BaseDocumentTransformer):
    """Convert the HTML in each document's ``page_content`` using ``html2text``.

    Each document's ``page_content`` is passed through
    ``html2text.HTML2Text().handle`` and replaced with the result.

    Arguments:
        ignore_links: Whether links should be ignored; defaults to True.
        ignore_images: Whether images should be ignored; defaults to True.

    Example:
        .. code-block:: python

            html2text = Html2TextTransformer()
            docs_transform = html2text.transform_documents(docs)
    """

    def __init__(self, ignore_links: bool = True, ignore_images: bool = True) -> None:
        """Store the html2text options applied on every transform call."""
        self.ignore_links = ignore_links
        self.ignore_images = ignore_images

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML content of every document in ``documents``.

        The documents are updated in place: each ``page_content`` is
        overwritten with the converted text, and the same ``documents``
        sequence that was passed in is returned.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The input sequence, with every ``page_content`` converted.

        Raises:
            ImportError: If the optional ``html2text`` package is missing.
        """
        # html2text is an optional dependency, so import it lazily and turn a
        # missing package into an actionable, single-line message.
        try:
            import html2text
        except ImportError as err:
            raise ImportError(
                "html2text package not found, please "
                "install it with `pip install html2text`"
            ) from err

        # Create a html2text.HTML2Text object and override some properties
        converter = html2text.HTML2Text()
        converter.ignore_links = self.ignore_links
        converter.ignore_images = self.ignore_images

        for document in documents:
            document.page_content = converter.handle(document.page_content)
        return documents

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronously convert the HTML content of ``documents``.

        Runs :meth:`transform_documents` in the event loop's default executor
        so the conversion does not block the loop. The in-place update and
        the return value are the same as in the synchronous method.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Forwarded unchanged to :meth:`transform_documents`.

        Returns:
            The input sequence, with every ``page_content`` converted.

        Raises:
            ImportError: If the optional ``html2text`` package is missing.
        """
        loop = asyncio.get_running_loop()
        # run_in_executor only forwards positional arguments, so bind the
        # keyword arguments with functools.partial first.
        return await loop.run_in_executor(
            None,
            functools.partial(self.transform_documents, documents, **kwargs),
        )