Spaces:
Sleeping
Sleeping
| """Shared chunking utilities for RAG ingestion.""" | |
| import re | |
# Default target chunk length, measured in characters (compared against len()
# of the text pieces); used as the default ``chunk_size`` of chunk_text_semantic.
DEFAULT_CHUNK_SIZE = 512
# Default number of trailing characters carried from one chunk into the next;
# used as the default ``overlap`` of chunk_text_semantic.
DEFAULT_CHUNK_OVERLAP = 80
# Minimum length, in characters, that a mid-document chunk must reach before
# chunk_text_semantic emits it (the final chunk is emitted regardless).
MIN_CHUNK_SIZE = 100
| def _split_into_sentences(text: str) -> list[str]: | |
| """Split text on sentence boundaries (rough heuristic).""" | |
| text = re.sub(r"\n+", "\n", text.strip()) | |
| if not text: | |
| return [] | |
| parts = re.split(r"(?<=[.!?])\s+", text) | |
| return [p.strip() for p in parts if p.strip()] | |
def chunk_text_semantic(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> list[str]:
    """Chunk *text* along paragraph and sentence boundaries.

    The text is first split into paragraphs on blank lines. Whitespace inside
    each paragraph is collapsed to single spaces. When there are several
    paragraphs, each one shorter than ``chunk_size`` is kept whole; any other
    paragraph (including the only paragraph of a single-paragraph text) is
    broken into sentences. The resulting units are packed greedily into
    chunks of roughly ``chunk_size`` characters, and trailing units totalling
    at most ``overlap`` characters are repeated at the start of the next
    chunk.

    Args:
        text: Raw text to chunk.
        chunk_size: Target chunk length in characters. A single unit longer
            than this is not split further and yields an oversized chunk.
        overlap: Maximum number of characters of trailing units carried over
            into the following chunk.

    Returns:
        The chunks in document order, or an empty list for blank input. If
        the whole text is one unsplittable unit longer than twice
        ``chunk_size``, the result of ``chunk_text_fallback`` is returned
        instead.
    """
    # Paragraph boundaries must be detected on the raw text: collapsing all
    # whitespace first would erase the blank lines that mark them.
    paragraphs = []
    for block in re.split(r"\n\s*\n", text):
        normalized = " ".join(block.split())
        if normalized:
            paragraphs.append(normalized)
    if not paragraphs:
        return []

    # Build the packing units: whole paragraphs where they fit, sentences
    # otherwise. A lone paragraph is always split into sentences.
    units: list[str] = []
    for paragraph in paragraphs:
        if len(paragraphs) > 1 and len(paragraph) < chunk_size:
            units.append(paragraph)
        else:
            units.extend(_split_into_sentences(paragraph) or [paragraph])

    # No usable boundaries at all and far too long: split by characters.
    if len(units) == 1 and len(units[0]) > chunk_size * 2:
        return chunk_text_fallback(units[0], chunk_size, overlap)

    chunks: list[str] = []
    pending: list[str] = []
    pending_len = 0
    for unit in units:
        unit_len = len(unit) + 1  # +1 accounts for the joining space
        if pending and pending_len + unit_len > chunk_size:
            joined = " ".join(pending)
            # A pending chunk below MIN_CHUNK_SIZE is not emitted; it keeps
            # accumulating so that its text is never dropped.
            if len(joined) >= MIN_CHUNK_SIZE:
                chunks.append(joined)
                # Seed the next chunk with the longest run of trailing units
                # that fits within the overlap budget.
                tail: list[str] = []
                tail_len = 0
                for item in reversed(pending):
                    if tail_len + len(item) + 1 > overlap:
                        break
                    tail.append(item)
                    tail_len += len(item) + 1
                tail.reverse()
                pending = tail
                pending_len = tail_len
        pending.append(unit)
        pending_len += unit_len

    # The final chunk is emitted regardless of its length.
    if pending:
        chunks.append(" ".join(pending))
    return chunks
def chunk_text_fallback(text: str, chunk_size: int, overlap: int) -> list[str]:
    """Split *text* into fixed-size character windows with overlap.

    Character-based chunking for when semantic splitting fails. Runs of
    whitespace are collapsed to single spaces before windowing.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be >= 1.
        overlap: Number of characters each chunk shares with the previous
            one. Negative values are treated as 0. Values >= ``chunk_size``
            make the window advance one character at a time.

    Returns:
        The chunks in order, or an empty list when *text* contains no
        non-whitespace characters.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    clean = " ".join(text.split())
    if not clean:
        return []

    # Clamp the overlap so the step never exceeds the window (which would
    # skip text) and never drops below 1 (which would not advance).
    step = max(1, chunk_size - max(overlap, 0))
    total = len(clean)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(clean[start:end])
        # Once a window reaches the end of the text, any further window
        # would only repeat text already contained in this one.
        if end >= total:
            break
    return chunks