# NLTK's sentence tokenizer needs the 'punkt' model; download it once with:
#   import nltk
#   nltk.download('punkt')
import re
from typing import List

from nltk.tokenize import sent_tokenize


def split_by_sections(text: str) -> List[str]:
    """
    Split legal text into chunks based on section headings such as
    'Section 1' or 'Sec. 2.3'. Each chunk is the heading followed by the
    text up to the next heading; text before the first heading is ignored.
    """
    section_pattern = re.compile(r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE)
    # Splitting on a capturing group keeps the headings: parts[0] is any
    # preamble, odd indices are headings, even indices are their bodies.
    parts = section_pattern.split(text)
    chunks = []
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        content = parts[i + 1].strip() if i + 1 < len(parts) else ""
        chunks.append(f"{header}\n{content}")
    return chunks


def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """
    Split a long legal document into semantically meaningful chunks:
    first section-wise using section headings, then sentence-wise so that
    each chunk stays within roughly max_tokens whitespace-separated words.
    Falls back to chunking the whole text when no headings are found.
    """
    final_chunks = []
    # Fall back to the whole document if split_by_sections finds no headings.
    section_chunks = split_by_sections(text) or [text]
    for section in section_chunks:
        sentences = sent_tokenize(section)
        current_chunk = ""
        for sentence in sentences:
            # Token count is approximated by whitespace-separated word count.
            if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
                current_chunk += " " + sentence
            else:
                if current_chunk:
                    final_chunks.append(current_chunk.strip())
                current_chunk = sentence
        # Flush the remainder of the current section.
        if current_chunk:
            final_chunks.append(current_chunk.strip())
    return final_chunks
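

# A minimal usage sketch. The sample text and the small max_tokens value are
# illustrative assumptions, not part of any real document or configuration.
if __name__ == "__main__":
    sample = (
        "Section 1. Definitions.\n"
        "In this Act, 'document' means any written record. "
        "A record may be stored electronically.\n"
        "Sec. 2.1 Scope.\n"
        "This Act applies to all public bodies. "
        "It does not apply to private correspondence."
    )
    for idx, chunk in enumerate(smart_chunk_text(sample, max_tokens=20), start=1):
        print(f"--- Chunk {idx} ---")
        print(chunk)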