from langchain.text_splitter import TextSplitter
from langchain.schema import Document

class StructureAwareTextSplitter(TextSplitter):
    """
    A custom text splitter that creates context-aware document chunks from structured HTML content.

    This splitter buffers paragraphs, lists, and tables together into chunks up to a specified size,
    preserving section headers and content structure. Tables are combined with surrounding content
    when possible, but split into their own chunk if too large. Useful for web page or wiki-style
    content where structure and context are important for downstream retrieval or LLM tasks.

    Args:
        chunk_size (int): Maximum number of words per chunk.
        chunk_overlap (int): Number of words to overlap between chunks (not currently used).

    Methods:
        split_text(text): Dummy implementation to satisfy the abstract base class.
        split_documents(structured_blocks, metadata=None): Splits structured content blocks into
            Document objects with preserved section headers and types.
    """
    def __init__(self, chunk_size=500, chunk_overlap=50):
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    # TODO: To be implemented
    def split_text(self, text):
        # Dummy implementation to satisfy the abstract base class
        return [text]

    def split_documents(self, structured_blocks, metadata=None):
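        # Buffering strategy: text from paragraph, list, table, and span blocks is
        # accumulated into current_chunk until adding the next block would push the
        # running word count past chunk_size; the buffer is then emitted as a
        # Document prefixed with the most recent section header.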
        current_chunk = ""
        current_words_cnt = 0
        current_header = ""
        documents = []

        def add_document(content, header, type_):
            documents.append(Document(
                page_content=content.strip(),
                metadata={
                    "section_header": header,
                    "type": type_,
                    **(metadata or {})
                }
            ))

        for block in structured_blocks:
            type_ = block['type']
            if type_ == 'header':
                current_header = block['text']

            elif type_ in ['paragraph', 'list']:
                if type_ == 'paragraph':
                    text = block['text']
                else:  # list
                    text = "\n".join(block['items']) + "\n"
                words_cnt = len(text.split())
                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += text + "\n"
                    current_words_cnt += words_cnt
                else:
                    # Flush the buffered chunk before starting a new one. By this
                    # point the buffer may hold several block types, so tag it as
                    # 'mixed' like the other flush sites, and skip an empty buffer
                    # (e.g. when a single block is larger than chunk_size).
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    current_chunk = text + "\n"
                    current_words_cnt = words_cnt

            elif type_ == 'table':
                table_text = f"{current_header} [Table]\n\n{block['text']}\n"
                words_cnt = len(table_text.split())
                # Try to buffer table with current chunk if possible
                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += table_text
                    current_words_cnt += words_cnt
                else:
                    # If current_chunk is not empty, flush it first
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    # If table itself is too big, split it alone
                    if words_cnt > self._chunk_size:
                        add_document(table_text, current_header, 'table')
                        current_chunk = ""
                        current_words_cnt = 0
                    else:
                        current_chunk = table_text
                        current_words_cnt = words_cnt

            elif type_ == 'span':
                text = block['text']
                words_cnt = len(text.split())
                if current_words_cnt + words_cnt <= self._chunk_size:
                    current_chunk += text + "\n"
                    current_words_cnt += words_cnt
                else:
                    # Same flush logic as above: emit the buffer only if it has content.
                    if current_chunk.strip():
                        add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')
                    current_chunk = text + "\n"
                    current_words_cnt = words_cnt

        if current_chunk.strip():
            add_document(f"{current_header}\n\n{current_chunk}", current_header, 'mixed')

        return documents
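

if __name__ == "__main__":
    # Usage sketch (illustrative only): the block dicts below mirror the shapes this
    # splitter expects -- 'header', 'paragraph', and 'span' blocks carry 'text',
    # 'list' blocks carry 'items', and 'table' blocks carry a flattened 'text'
    # representation. A real pipeline would produce these blocks from its own
    # HTML-parsing step; the sample values here are made up for demonstration.
    sample_blocks = [
        {"type": "header", "text": "Installation"},
        {"type": "paragraph", "text": "Install the package with pip before running the examples."},
        {"type": "list", "items": ["pip install example-package", "python -m example_package --help"]},
        {"type": "table", "text": "Option | Default\n--verbose | False\n--retries | 3"},
    ]

    splitter = StructureAwareTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = splitter.split_documents(sample_blocks, metadata={"source": "example.html"})
    for doc in docs:
        print(doc.metadata["section_header"], doc.metadata["type"])
        print(doc.page_content)
        print("---")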