Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

mekosotto committed 5 days ago

Commit

75fd700

1 Parent(s): 7711406

feat(rag): paragraph-aware chunker (chunk_text)

Browse files

Files changed (4) hide show

src/rag/__init__.py +0 -0
src/rag/chunker.py +39 -0
tests/rag/__init__.py +0 -0
tests/rag/test_chunker.py +40 -0

src/rag/__init__.py ADDED Viewed

File without changes

src/rag/chunker.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Paragraph-aware recursive character splitter for RAG ingestion.
+Public entry: `chunk_text(text, max_chars, overlap)`. Splits on the first
+of [paragraph break, sentence end, newline, space] that fits inside the
+window. Empty / whitespace-only inputs return [].
+"""
+from __future__ import annotations
# Candidate split points, tried in priority order: paragraph break, sentence
# end, line break, single space.
_SEPARATORS: tuple[str, ...] = ("\n\n", ". ", "\n", " ")


def chunk_text(text: str, max_chars: int = 600, overlap: int = 80) -> list[str]:
    """Split `text` into chunks of at most `max_chars`, with `overlap` carry-over.

    The text is stripped, then consumed window by window. Each window is cut
    just after the last occurrence of the highest-priority separator in
    `_SEPARATORS` that lies inside it; when no usable separator exists the
    window is cut hard at `max_chars`. Every chunk is stripped, and chunks
    that are empty after stripping are dropped.

    Args:
        text: Text to split.
        max_chars: Maximum length of any returned chunk. Must be >= 1.
        overlap: Number of characters from the end of one chunk to repeat at
            the start of the next. Clamped to the range [0, max_chars - 1] so
            that no text is skipped and every chunk adds new content.

    Returns:
        The chunks in document order. Empty or whitespace-only input yields
        []; input that already fits in `max_chars` yields a single chunk.

    Raises:
        ValueError: If `max_chars` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    # A negative overlap would skip characters between chunks and an overlap
    # of max_chars or more would leave no room for new content.
    overlap = max(0, min(overlap, max_chars - 1))

    chunks: list[str] = []
    n = len(text)
    start = 0
    prev_end = 0  # where the previous chunk was cut
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Try to land on a clean boundary inside [start, end).
            for sep in _SEPARATORS:
                last = text.rfind(sep, start, end)
                cut = last + len(sep)
                # Because of the overlap, the separator that ended the
                # previous chunk is usually still inside this window. Cutting
                # there again would emit a chunk made only of repeated text,
                # so a cut is accepted only if it lies beyond prev_end.
                if last > start and cut > prev_end:
                    end = cut
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        prev_end = end
        # Step back by `overlap`, but always move forward at least one char.
        start = max(start + 1, end - overlap)
    return chunks

tests/rag/__init__.py ADDED Viewed

File without changes

tests/rag/test_chunker.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Tests for src.rag.chunker — paragraph-aware character splitter."""
+from __future__ import annotations
+import pytest
+from src.rag.chunker import chunk_text
class TestChunkText:
    """Behavioural tests for `chunk_text`."""

    def test_short_text_returns_single_chunk(self) -> None:
        """Text that already fits in `max_chars` comes back as one chunk."""
        out = chunk_text("hello world", max_chars=100, overlap=10)
        assert out == ["hello world"]

    def test_empty_text_returns_empty_list(self) -> None:
        """Empty and whitespace-only inputs produce no chunks."""
        assert chunk_text("", max_chars=100, overlap=10) == []
        assert chunk_text("   \n\n  ", max_chars=100, overlap=10) == []

    def test_long_text_splits_into_multiple_chunks(self) -> None:
        """Text without separators is cut into windows of at most `max_chars`."""
        text = "a" * 250
        out = chunk_text(text, max_chars=100, overlap=10)
        assert len(out) >= 3
        # every chunk respects max_chars
        for c in out:
            assert len(c) <= 100

    def test_overlap_between_chunks(self) -> None:
        """Consecutive chunks share some characters."""
        text = "abcdefghij" * 30  # 300 chars, no natural break
        out = chunk_text(text, max_chars=100, overlap=20)
        # consecutive chunks share at least some characters
        for prev, nxt in zip(out, out[1:]):
            assert prev[-10:] in nxt or nxt[:10] in prev

    def test_paragraph_boundary_preferred(self) -> None:
        """A paragraph break inside the window is chosen as the cut point."""
        # First paragraph fits, second doesn't — split at \n\n
        para_a = "First paragraph content."
        para_b = "Second paragraph content " * 10
        text = f"{para_a}\n\n{para_b}"
        out = chunk_text(text, max_chars=100, overlap=10)
        # The first chunk must be exactly the first paragraph: a containment
        # check would also pass if the chunk ran on past the paragraph break.
        assert out[0] == para_a