import re
from typing import List

# NOTE: sent_tokenize requires the NLTK "punkt" tokenizer data
# (download it once with nltk.download("punkt")).
from nltk.tokenize import sent_tokenize


def split_by_sections(text: str) -> List[str]:
    """
    Split legal text into chunks at section headings such as 'Section 1' or 'Sec. 2.3'.

    Each returned chunk is the heading line followed by the text running up to the
    next heading. Text before the first heading is ignored here; smart_chunk_text()
    falls back to treating the whole document as one section when no headings exist.
    """
    section_pattern = re.compile(
        r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE
    )
    # Because the pattern is a capture group, re.split() keeps the headings:
    # parts = [preamble, heading 1, body 1, heading 2, body 2, ...]
    parts = section_pattern.split(text)

    chunks = []
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        content = parts[i + 1].strip() if i + 1 < len(parts) else ""
        chunks.append(f"{header}\n{content}")
    return chunks


def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """
    Split a long legal document into semantically meaningful chunks.

    The document is first split section-wise using its section headers; each section
    is then packed sentence by sentence into chunks of at most max_tokens
    whitespace-delimited words. If no section headers are found, the whole document
    is treated as a single section.
    """
    final_chunks = []
    section_chunks = split_by_sections(text) or [text]

    for section in section_chunks:
        sentences = sent_tokenize(section)
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
                current_chunk += " " + sentence
            else:
                # The running chunk is full: flush it and start a new one.
                # A single sentence longer than max_tokens still becomes its
                # own (oversized) chunk.
                if current_chunk.strip():
                    final_chunks.append(current_chunk.strip())
                current_chunk = sentence
        if current_chunk.strip():
            final_chunks.append(current_chunk.strip())

    return final_chunks
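

# Minimal usage sketch (illustrative only): the sample agreement text below is
# made up, and the NLTK "punkt" tokenizer data is assumed to be available locally.
if __name__ == "__main__":
    import nltk

    nltk.download("punkt", quiet=True)  # no-op if the tokenizer data is already present

    sample = (
        "Section 1 Definitions\n"
        "In this Agreement, 'Party' means a signatory to this Agreement. "
        "'Effective Date' means the date of the last signature below.\n"
        "Section 2 Term\n"
        "This Agreement commences on the Effective Date and continues for two years."
    )

    for chunk in smart_chunk_text(sample, max_tokens=40):
        print(chunk)
        print("-" * 40)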