# Spaces:
# Runtime error
# Runtime error
from typing import Any, Iterator, List, Sequence, cast

from langchain_core.documents import BaseDocumentTransformer, Document
class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """

    def __init__(self) -> None:
        """Initialize the transformer.

        Only verifies that the BeautifulSoup4 package can be imported; no
        instance state is set up.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as exc:
            # Chain the original failure so the underlying cause stays visible.
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            ) from exc

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: Sequence[str] = ("script", "style"),
        tags_to_extract: Sequence[str] = ("p", "li", "div", "a"),
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a list of Document objects by cleaning their HTML content.

        Each document's ``page_content`` is overwritten in place; the same
        ``documents`` sequence that was passed in is returned.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: Tag names to be removed from the HTML.
            tags_to_extract: Tag names whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be
                removed from the extracted content.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The input sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    # The three helpers below take no ``self``; they must be static methods so
    # that the ``self.<helper>(...)`` calls above do not pass the instance as
    # the first positional argument.
    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: Sequence[str]) -> str:
        """Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: Tag names to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: Sequence[str]) -> str:
        """Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: Tag names to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags,
            joined with single spaces.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts: List[str] = []
        for element in soup.find_all():
            if element.name in tags:
                # Extract all navigable strings recursively from this element.
                text_parts += get_navigable_strings(element)

                # To avoid duplicate text, remove all descendants from the soup.
                element.decompose()

        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """Clean up the content by removing unnecessary lines.

        Splits on newlines, strips each line, drops the empty ones and joins
        the remainder with single spaces.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        stripped_lines = (line.strip() for line in content.split("\n"))
        return " ".join(line for line in stripped_lines if line)

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous variant of ``transform_documents``; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError


def get_navigable_strings(element: Any) -> Iterator[str]:
    """Yield the stripped text of every string node beneath ``element``.

    Child tags are walked recursively in document order. A string that sits
    directly inside an ``a`` element with a truthy ``href`` attribute is
    yielded as ``"<text> (<href>)"``; every other string is yielded as its
    stripped text.

    Args:
        element: The element whose children are traversed (treated as a
            bs4 ``Tag``).

    Yields:
        One string per string node encountered.
    """
    from bs4 import NavigableString, Tag

    for node in cast(Tag, element).children:
        if isinstance(node, Tag):
            # Descend into nested tags before moving on to the next sibling.
            yield from get_navigable_strings(node)
            continue
        if not isinstance(node, NavigableString):
            continue

        text = node.strip()
        # Only the direct parent matters: link text gets its target appended.
        link = element.get("href") if element.name == "a" else None
        yield f"{text} ({link})" if link else text