mtyrrell committed
Commit 698f034 · 1 Parent(s): 537051a

recursive chunking

Files changed (2)
  1. app/main.py +25 -80
  2. requirements.txt +3 -0
app/main.py CHANGED
@@ -16,6 +16,9 @@ from pathlib import Path
 import PyPDF2
 from docx import Document as DocxDocument
 
+# LangChain imports for better text chunking
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -99,82 +102,24 @@ def extract_text_from_docx(file_path: str) -> tuple[str, Dict[str, Any]]:
         logger.error(f"DOCX extraction error: {str(e)}")
         raise Exception(f"Failed to extract text from DOCX: {str(e)}")
 
-def simple_text_splitter(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
-    """Simple text splitter without external dependencies"""
-    if not text:
-        return []
-
-    # Split by common separators in order of preference
-    separators = ["\n\n", "\n", ". ", "! ", "? ", " "]
-
-    def split_text_recursive(text: str, separators: List[str]) -> List[str]:
-        if not separators:
-            # If no separators left, split by character count
-            chunks = []
-            for i in range(0, len(text), chunk_size - chunk_overlap):
-                chunk = text[i:i + chunk_size]
-                if chunk.strip():
-                    chunks.append(chunk.strip())
-            return chunks
-
-        separator = separators[0]
-        remaining_separators = separators[1:]
-
-        splits = text.split(separator)
-        chunks = []
-        current_chunk = ""
-
-        for split in splits:
-            # If adding this split would exceed chunk_size
-            if len(current_chunk) + len(split) + len(separator) > chunk_size:
-                if current_chunk:
-                    # If current chunk is still too big, recursively split it
-                    if len(current_chunk) > chunk_size:
-                        sub_chunks = split_text_recursive(current_chunk, remaining_separators)
-                        chunks.extend(sub_chunks)
-                    else:
-                        chunks.append(current_chunk.strip())
-                current_chunk = split
-            else:
-                if current_chunk:
-                    current_chunk += separator + split
-                else:
-                    current_chunk = split
-
-        # Add the last chunk
-        if current_chunk:
-            if len(current_chunk) > chunk_size:
-                sub_chunks = split_text_recursive(current_chunk, remaining_separators)
-                chunks.extend(sub_chunks)
-            else:
-                chunks.append(current_chunk.strip())
-
-        return chunks
-
-    # Split the text
-    initial_chunks = split_text_recursive(text, separators)
-
-    # Add overlap between chunks
-    final_chunks = []
-    for i, chunk in enumerate(initial_chunks):
-        if i > 0 and chunk_overlap > 0:
-            # Add overlap from previous chunk
-            prev_chunk = initial_chunks[i-1]
-            overlap = prev_chunk[-chunk_overlap:] if len(prev_chunk) > chunk_overlap else prev_chunk
-            chunk = overlap + " " + chunk
-        final_chunks.append(chunk)
-
-    return [chunk for chunk in final_chunks if chunk.strip()]
-
 def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
-    """Clean text and split into chunks"""
+    """Clean text and split into chunks using LangChain RecursiveCharacterTextSplitter"""
     # Basic text cleaning
     text = re.sub(r'\n+', '\n', text) # Remove multiple newlines
     text = re.sub(r'\s+', ' ', text) # Remove multiple spaces
     text = text.strip()
 
-    # Split text into chunks using simple splitter
-    chunks = simple_text_splitter(text, chunk_size=500, chunk_overlap=50)
+    # Initialize RecursiveCharacterTextSplitter with better parameters
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=700, # Target chunk size
+        chunk_overlap=50, # Overlap between chunks
+        length_function=len, # Function to measure text length
+        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""], # Priority order of separators
+        is_separator_regex=False # Use literal separators, not regex
+    )
+
+    # Split text into chunks using LangChain
+    chunks = text_splitter.split_text(text)
 
     # Create DocumentChunk objects
     document_chunks = []
@@ -186,7 +131,8 @@ def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
             metadata={
                 "chunk_index": i,
                 "chunk_length": len(chunk_text),
-                "created_at": datetime.now().isoformat()
+                "created_at": datetime.now().isoformat(),
+                "chunking_method": "langchain_recursive_splitter"
             }
         )
         document_chunks.append(chunk)
@@ -300,13 +246,13 @@ def gradio_upload_and_process(file):
 
     # Format response for Gradio
     response_text = f"""
-✅ Document processed successfully!
+Document processed successfully!
 
-📄 Document ID: {result.doc_id}
-📊 Chunks created: {result.chunks_indexed}
-⏱️ Processing time: {result.metadata['processing_time']:.2f}s
-📝 Total text length: {result.metadata['total_text_length']} characters
-📑 File type: {result.metadata['file_type']}
+Document ID: {result.doc_id}
+Chunks created: {result.chunks_indexed}
+Processing time: {result.metadata['processing_time']:.2f}s
+Total text length: {result.metadata['total_text_length']} characters
+File type: {result.metadata['file_type']}
 
 Status: {result.status}
 """
@@ -315,11 +261,10 @@ Status: {result.status}
     chunks = DOCUMENT_STORE.get(result.doc_id, [])
     chunks_display = ""
    if chunks:
-        chunks_display = "📄 Processed Chunks:\n\n"
-        for i, chunk in enumerate(chunks[:10]): # Show first 10 chunks
+        for i, chunk in enumerate(chunks): # Show all chunks
             chunks_display += f"--- Chunk {i+1} ---\n"
             chunks_display += f"Length: {len(chunk.content)} characters\n"
-            chunks_display += f"Content: {chunk.content[:200]}{'...' if len(chunk.content) > 200 else ''}\n\n"
+            chunks_display += f"Content: {chunk.content}\n\n"
 
     if len(chunks) > 10:
         chunks_display += f"... and {len(chunks) - 10} more chunks\n"
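For review, here is a minimal standalone sketch of the new chunking path, using the same parameters as clean_and_chunk_text above; the sample string and print format are made up for illustration and are not part of the commit:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same configuration as the committed code
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
    is_separator_regex=False,
)

# Hypothetical input: two long "paragraphs" separated by a blank line
sample = "First topic sentence. " * 40 + "\n\n" + "Second topic sentence. " * 40
for i, chunk in enumerate(splitter.split_text(sample)):
    print(f"chunk {i}: {len(chunk)} chars, starts {chunk[:25]!r}")

The splitter tries each separator in priority order ("\n\n" first) and only falls back to the next one when a piece is still longer than chunk_size, so paragraph and sentence boundaries are preserved where possible — the behavior the removed simple_text_splitter approximated by hand.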
requirements.txt CHANGED
@@ -8,6 +8,9 @@ python-multipart>=0.0.9
 PyPDF2==3.0.1
 python-docx==1.1.0
 
+# LangChain text splitters (standalone package)
+langchain-text-splitters==0.0.1
+
 # Utilities
 python-dotenv==1.0.0
 
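A quick way to confirm the new pin resolves after installing (a sketch using only the standard library):

from importlib.metadata import version
print(version("langchain-text-splitters"))  # expect 0.0.1, matching requirements.txt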