import re
from typing import List

# NOTE: sent_tokenize requires the NLTK "punkt" tokenizer data
# (download it once with nltk.download("punkt")).
from nltk.tokenize import sent_tokenize


def split_by_sections(text: str) -> List[str]:
    """
    Split legal text into chunks at section headings such as 'Section 1' or 'Sec. 2.3'.

    Each returned chunk is the heading line followed by the text running up to the
    next heading. Text before the first heading is ignored here; smart_chunk_text()
    falls back to treating the whole document as one section when no headings exist.
    """
    section_pattern = re.compile(
        r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE
    )
    # Because the pattern is a capture group, re.split() keeps the headings:
    # parts = [preamble, heading 1, body 1, heading 2, body 2, ...]
    parts = section_pattern.split(text)

    chunks = []
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        content = parts[i + 1].strip() if i + 1 < len(parts) else ""
        chunks.append(f"{header}\n{content}")
    return chunks


def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """
    Split a long legal document into semantically meaningful chunks.

    The document is first split section-wise using its section headers; each section
    is then packed sentence by sentence into chunks of at most max_tokens
    whitespace-delimited words. If no section headers are found, the whole document
    is treated as a single section.
    """
    final_chunks = []
    section_chunks = split_by_sections(text) or [text]

    for section in section_chunks:
        sentences = sent_tokenize(section)
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
                current_chunk += " " + sentence
            else:
                # The running chunk is full: flush it and start a new one.
                # A single sentence longer than max_tokens still becomes its
                # own (oversized) chunk.
                if current_chunk.strip():
                    final_chunks.append(current_chunk.strip())
                current_chunk = sentence
        if current_chunk.strip():
            final_chunks.append(current_chunk.strip())

    return final_chunks
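

# Minimal usage sketch (illustrative only): the sample agreement text below is
# made up, and the NLTK "punkt" tokenizer data is assumed to be available locally.
if __name__ == "__main__":
    import nltk

    nltk.download("punkt", quiet=True)  # no-op if the tokenizer data is already present

    sample = (
        "Section 1 Definitions\n"
        "In this Agreement, 'Party' means a signatory to this Agreement. "
        "'Effective Date' means the date of the last signature below.\n"
        "Section 2 Term\n"
        "This Agreement commences on the Effective Date and continues for two years."
    )

    for chunk in smart_chunk_text(sample, max_tokens=40):
        print(chunk)
        print("-" * 40)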