Spaces:
Runtime error
Runtime error
File size: 5,017 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
from typing import Any, Iterator, List, Sequence, cast
from langchain_core.documents import BaseDocumentTransformer, Document
class BeautifulSoupTransformer(BaseDocumentTransformer):
"""Transform HTML content by extracting specific tags and removing unwanted ones.
Example:
.. code-block:: python
from langchain.document_transformers import BeautifulSoupTransformer
bs4_transformer = BeautifulSoupTransformer()
docs_transformed = bs4_transformer.transform_documents(docs)
"""
def __init__(self) -> None:
"""
Initialize the transformer.
This checks if the BeautifulSoup4 package is installed.
If not, it raises an ImportError.
"""
try:
import bs4 # noqa:F401
except ImportError:
raise ImportError(
"BeautifulSoup4 is required for BeautifulSoupTransformer. "
"Please install it with `pip install beautifulsoup4`."
)
def transform_documents(
self,
documents: Sequence[Document],
unwanted_tags: List[str] = ["script", "style"],
tags_to_extract: List[str] = ["p", "li", "div", "a"],
remove_lines: bool = True,
**kwargs: Any,
) -> Sequence[Document]:
"""
Transform a list of Document objects by cleaning their HTML content.
Args:
documents: A sequence of Document objects containing HTML content.
unwanted_tags: A list of tags to be removed from the HTML.
tags_to_extract: A list of tags whose content will be extracted.
remove_lines: If set to True, unnecessary lines will be
removed from the HTML content.
Returns:
A sequence of Document objects with transformed content.
"""
for doc in documents:
cleaned_content = doc.page_content
cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)
cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)
if remove_lines:
cleaned_content = self.remove_unnecessary_lines(cleaned_content)
doc.page_content = cleaned_content
return documents
@staticmethod
def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
"""
Remove unwanted tags from a given HTML content.
Args:
html_content: The original HTML content string.
unwanted_tags: A list of tags to be removed from the HTML.
Returns:
A cleaned HTML string with unwanted tags removed.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
for tag in unwanted_tags:
for element in soup.find_all(tag):
element.decompose()
return str(soup)
@staticmethod
def extract_tags(html_content: str, tags: List[str]) -> str:
"""
Extract specific tags from a given HTML content.
Args:
html_content: The original HTML content string.
tags: A list of tags to be extracted from the HTML.
Returns:
A string combining the content of the extracted tags.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
text_parts: List[str] = []
for element in soup.find_all():
if element.name in tags:
# Extract all navigable strings recursively from this element.
text_parts += get_navigable_strings(element)
# To avoid duplicate text, remove all descendants from the soup.
element.decompose()
return " ".join(text_parts)
@staticmethod
def remove_unnecessary_lines(content: str) -> str:
"""
Clean up the content by removing unnecessary lines.
Args:
content: A string, which may contain unnecessary lines or spaces.
Returns:
A cleaned string with unnecessary lines removed.
"""
lines = content.split("\n")
stripped_lines = [line.strip() for line in lines]
non_empty_lines = [line for line in stripped_lines if line]
cleaned_content = " ".join(non_empty_lines)
return cleaned_content
async def atransform_documents(
self,
documents: Sequence[Document],
**kwargs: Any,
) -> Sequence[Document]:
raise NotImplementedError
def get_navigable_strings(element: Any) -> Iterator[str]:
from bs4 import NavigableString, Tag
for child in cast(Tag, element).children:
if isinstance(child, Tag):
yield from get_navigable_strings(child)
elif isinstance(child, NavigableString):
if (element.name == "a") and (href := element.get("href")):
yield f"{child.strip()} ({href})"
else:
yield child.strip()
|