from transformers import AutoTokenizer
from typing import List
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


class DocumentChunker:
    def __init__(self):
        self.tokenizer = None

    def initialize_tokenizer(self, model_name="VincentMuriuki/legal-summarizer"):
        """Initialize the tokenizer if it has not already been loaded."""
        if self.tokenizer is None:
            logger.info(f"🤗 Loading tokenizer: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

						|  | def chunk_by_tokens(self, text: str, max_tokens: int = 1600, stride: int = 50) -> List[str]: | 
					
						
						|  | """Fast token-window chunking without NLTK dependency""" | 
					
						
						|  | self.initialize_tokenizer() | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | ids = self.tokenizer.encode(text, add_special_tokens=False) | 
					
						
						|  | chunks = [] | 
					
						
						|  | i, n = 0, len(ids) | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"π Total tokens: {n}, creating chunks with max_tokens: {max_tokens}") | 
					
						
						|  |  | 
					
						
						|  | while i < n: | 
					
						
						|  | j = min(i + max_tokens, n) | 
					
						
						|  | chunk_ids = ids[i:j] | 
					
						
						|  | chunk_text = self.tokenizer.decode(chunk_ids, skip_special_tokens=True) | 
					
						
						|  | chunks.append(chunk_text) | 
					
						
						|  |  | 
					
						
						|  | if j == n: | 
					
						
						|  | break | 
					
						
						|  | i = max(j - stride, 0) | 
					
						
						|  |  | 
					
						
						|  | logger.info(f"βοΈ Created {len(chunks)} chunks") | 
					
						
						|  | return chunks | 
					
						
						|  |  |
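

# --- Usage sketch (not part of the original module) ---
# A minimal example of how DocumentChunker might be driven; the sample text
# below is hypothetical and only illustrates the call pattern and the
# effect of the `max_tokens` / `stride` parameters.
if __name__ == "__main__":
    chunker = DocumentChunker()
    # Hypothetical input: a long legal-style string repeated to exceed one window.
    sample = "This Agreement is made and entered into by the parties hereto. " * 200
    pieces = chunker.chunk_by_tokens(sample, max_tokens=1600, stride=50)
    logger.info(f"First chunk preview: {pieces[0][:80]}...")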