# One-time setup: the NLTK sentence tokenizer needs the 'punkt' model.
#   import nltk
#   nltk.download('punkt')  # only needed once


from typing import List
import re
from nltk.tokenize import sent_tokenize

def split_by_sections(text: str) -> List[str]:
    """
    Split legal text into chunks based on section headings such as
    'Section 1' or 'Sec. 2.3'. If no headings are found, the whole
    text is returned as a single chunk.
    """
    # The capturing group makes re.split() keep each matched heading, so the
    # result alternates: [preamble, heading, body, heading, body, ...]
    section_pattern = re.compile(r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE)
    parts = section_pattern.split(text)

    # No headings matched: fall back to treating the whole text as one chunk.
    if len(parts) == 1:
        return [text.strip()] if text.strip() else []

    chunks = []
    # Keep any text that appears before the first heading so it is not lost.
    if parts[0].strip():
        chunks.append(parts[0].strip())
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        content = parts[i + 1].strip() if i + 1 < len(parts) else ""
        chunks.append(f"{header}\n{content}")
    return chunks

def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """
    Split a long legal document into chunks of at most roughly `max_tokens`
    whitespace-delimited words: the text is first split on section headers,
    then whole sentences are packed into each chunk up to the limit.
    """
    final_chunks = []
    section_chunks = split_by_sections(text)

    for section in section_chunks:
        sentences = sent_tokenize(section)
        current_chunk = ""
        for sentence in sentences:
            # Word count is used as a cheap proxy for token count; a single
            # sentence longer than max_tokens still becomes its own chunk.
            if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
                current_chunk += " " + sentence
            else:
                if current_chunk:  # avoid emitting empty chunks
                    final_chunks.append(current_chunk.strip())
                current_chunk = sentence
        if current_chunk:
            final_chunks.append(current_chunk.strip())

    return final_chunks
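

# Minimal usage sketch. The sample text below is hypothetical, invented only to
# illustrate the expected output of both helpers; it assumes the NLTK 'punkt'
# model has been downloaded as noted at the top of this file.
if __name__ == "__main__":
    sample = (
        "Section 1. Definitions\n"
        "In this agreement, 'Party' means either signatory. "
        "Terms not defined here keep their ordinary meaning.\n"
        "Sec. 2.1 Term\n"
        "This agreement remains in force for two years. "
        "Either party may terminate with thirty days written notice."
    )

    # Section-level split: one chunk per heading plus its body.
    for section in split_by_sections(sample):
        print("SECTION:", section.replace("\n", " | "))

    # Sentence-packed chunks of at most ~20 words each.
    for chunk in smart_chunk_text(sample, max_tokens=20):
        print("CHUNK:", chunk.replace("\n", " | "))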