# super_agent/tools/utils.py
from langchain.text_splitter import TextSplitter
from langchain.schema import Document


class StructureAwareTextSplitter(TextSplitter):
    """
    A custom text splitter that creates context-aware document chunks from structured HTML content.

    This splitter buffers paragraphs, lists, and tables together into chunks up to a specified size,
    preserving section headers and content structure. Tables are combined with surrounding content
    when possible, but split into their own chunk if too large. Useful for web-page or wiki-style
    content where structure and context matter for downstream retrieval or LLM tasks.

    Args:
        chunk_size (int): Maximum number of words per chunk.
        chunk_overlap (int): Number of words to overlap between chunks (not currently used).

    Methods:
        split_text(text): Dummy implementation to satisfy the abstract base class.
        split_documents(structured_blocks, metadata=None): Splits structured content blocks into
            Document objects with preserved section headers and types.
    """

    def __init__(self, chunk_size=500, chunk_overlap=50):
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # TODO: To be implemented
    def split_text(self, text):
        # Dummy implementation to satisfy the abstract base class
        return [text]

    def split_documents(self, structured_blocks, metadata=None):
        current_chunk = ""
        current_words_cnt = 0
        current_header = ""
        documents = []

        def add_document(content, header, type_):
            # Attach the section header and block type, merged with any
            # caller-supplied metadata.
            documents.append(Document(
                page_content=content.strip(),
                metadata={
                    "section_header": header,
                    "type": type_,
                    **(metadata or {})
                }
            ))

        for block in structured_blocks:
            type_ = block['type']

            if type_ == 'header':
                current_header = block['text']
            elif type_ in ['paragraph', 'list']:
                if type_ == 'paragraph':
                    text = block['text']
                else:  # list
                    text = "\n".join(block['items']) + "\n"
                words_cnt = len(text.split())

                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += text + "\n"
                    current_words_cnt += words_cnt
                else:
                    # Flush the buffered chunk; it may hold several block
                    # types, so label it 'mixed' (matching the span branch).
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    current_chunk = text + "\n"
                    current_words_cnt = words_cnt
            elif type_ == 'table':
                table_text = f"{current_header} [Table]\n\n{block['text']}\n"
                words_cnt = len(table_text.split())

                # Try to buffer the table with the current chunk if possible
                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += table_text
                    current_words_cnt += words_cnt
                else:
                    # If current_chunk is not empty, flush it first
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    # If the table itself is too big, emit it as its own chunk
                    if words_cnt > self._chunk_size:
                        add_document(table_text, current_header, 'table')
                        current_chunk = ""
                        current_words_cnt = 0
                    else:
                        current_chunk = table_text
                        current_words_cnt = words_cnt
            elif type_ == 'span':
                text = block['text']
                words_cnt = len(text.split())

                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += text + "\n"
                    current_words_cnt += words_cnt
                else:
                    # Guard against emitting a header-only document when the
                    # buffer is empty.
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    current_chunk = text + "\n"
                    current_words_cnt = words_cnt

        # Flush whatever is still buffered at the end of the input
        if current_chunk.strip():
            add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')

        return documents
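

# Minimal usage sketch (illustrative, not part of the original module). The
# sample blocks below are invented to show the expected input shapes and the
# resulting chunk metadata; chunk_size is counted in words, per the docstring.
if __name__ == "__main__":
    splitter = StructureAwareTextSplitter(chunk_size=50, chunk_overlap=0)
    blocks = [
        {'type': 'header', 'text': 'Overview'},
        {'type': 'paragraph', 'text': 'A short introductory paragraph.'},
        {'type': 'list', 'items': ['first item', 'second item']},
        {'type': 'table', 'text': 'col_a | col_b\n1 | 2'},
    ]
    docs = splitter.split_documents(blocks, metadata={'source': 'example'})
    for doc in docs:
        print(doc.metadata, '->', doc.page_content[:60].replace('\n', ' '))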