mtyrrell committed
Commit 698f034 · 1 Parent(s): 537051a

recursive chunking

Files changed (2)
  1. app/main.py +25 -80
  2. requirements.txt +3 -0
app/main.py CHANGED
@@ -16,6 +16,9 @@ from pathlib import Path
 import PyPDF2
 from docx import Document as DocxDocument
 
+# LangChain imports for better text chunking
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -99,82 +102,24 @@ def extract_text_from_docx(file_path: str) -> tuple[str, Dict[str, Any]]:
         logger.error(f"DOCX extraction error: {str(e)}")
         raise Exception(f"Failed to extract text from DOCX: {str(e)}")
 
-def simple_text_splitter(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
-    """Simple text splitter without external dependencies"""
-    if not text:
-        return []
-
-    # Split by common separators in order of preference
-    separators = ["\n\n", "\n", ". ", "! ", "? ", " "]
-
-    def split_text_recursive(text: str, separators: List[str]) -> List[str]:
-        if not separators:
-            # If no separators left, split by character count
-            chunks = []
-            for i in range(0, len(text), chunk_size - chunk_overlap):
-                chunk = text[i:i + chunk_size]
-                if chunk.strip():
-                    chunks.append(chunk.strip())
-            return chunks
-
-        separator = separators[0]
-        remaining_separators = separators[1:]
-
-        splits = text.split(separator)
-        chunks = []
-        current_chunk = ""
-
-        for split in splits:
-            # If adding this split would exceed chunk_size
-            if len(current_chunk) + len(split) + len(separator) > chunk_size:
-                if current_chunk:
-                    # If current chunk is still too big, recursively split it
-                    if len(current_chunk) > chunk_size:
-                        sub_chunks = split_text_recursive(current_chunk, remaining_separators)
-                        chunks.extend(sub_chunks)
-                    else:
-                        chunks.append(current_chunk.strip())
-                current_chunk = split
-            else:
-                if current_chunk:
-                    current_chunk += separator + split
-                else:
-                    current_chunk = split
-
-        # Add the last chunk
-        if current_chunk:
-            if len(current_chunk) > chunk_size:
-                sub_chunks = split_text_recursive(current_chunk, remaining_separators)
-                chunks.extend(sub_chunks)
-            else:
-                chunks.append(current_chunk.strip())
-
-        return chunks
-
-    # Split the text
-    initial_chunks = split_text_recursive(text, separators)
-
-    # Add overlap between chunks
-    final_chunks = []
-    for i, chunk in enumerate(initial_chunks):
-        if i > 0 and chunk_overlap > 0:
-            # Add overlap from previous chunk
-            prev_chunk = initial_chunks[i-1]
-            overlap = prev_chunk[-chunk_overlap:] if len(prev_chunk) > chunk_overlap else prev_chunk
-            chunk = overlap + " " + chunk
-        final_chunks.append(chunk)
-
-    return [chunk for chunk in final_chunks if chunk.strip()]
-
 def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
-    """Clean text and split into chunks"""
+    """Clean text and split into chunks using LangChain RecursiveCharacterTextSplitter"""
     # Basic text cleaning
     text = re.sub(r'\n+', '\n', text) # Remove multiple newlines
     text = re.sub(r'\s+', ' ', text) # Remove multiple spaces
     text = text.strip()
 
-    # Split text into chunks using simple splitter
-    chunks = simple_text_splitter(text, chunk_size=500, chunk_overlap=50)
+    # Initialize RecursiveCharacterTextSplitter with better parameters
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=700, # Target chunk size
+        chunk_overlap=50, # Overlap between chunks
+        length_function=len, # Function to measure text length
+        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""], # Priority order of separators
+        is_separator_regex=False # Use literal separators, not regex
+    )
+
+    # Split text into chunks using LangChain
+    chunks = text_splitter.split_text(text)
 
     # Create DocumentChunk objects
     document_chunks = []
@@ -186,7 +131,8 @@ def clean_and_chunk_text(text: str, doc_id: str) -> List[DocumentChunk]:
             metadata={
                 "chunk_index": i,
                 "chunk_length": len(chunk_text),
-                "created_at": datetime.now().isoformat()
+                "created_at": datetime.now().isoformat(),
+                "chunking_method": "langchain_recursive_splitter"
             }
         )
         document_chunks.append(chunk)
@@ -300,13 +246,13 @@ def gradio_upload_and_process(file):
 
     # Format response for Gradio
     response_text = f"""
-✅ Document processed successfully!
+Document processed successfully!
 
-📄 Document ID: {result.doc_id}
-📊 Chunks created: {result.chunks_indexed}
-⏱️ Processing time: {result.metadata['processing_time']:.2f}s
-📝 Total text length: {result.metadata['total_text_length']} characters
-📑 File type: {result.metadata['file_type']}
+Document ID: {result.doc_id}
+Chunks created: {result.chunks_indexed}
+Processing time: {result.metadata['processing_time']:.2f}s
+Total text length: {result.metadata['total_text_length']} characters
+File type: {result.metadata['file_type']}
 
 Status: {result.status}
 """
@@ -315,11 +261,10 @@ Status: {result.status}
     chunks = DOCUMENT_STORE.get(result.doc_id, [])
     chunks_display = ""
    if chunks:
-        chunks_display = "📄 Processed Chunks:\n\n"
-        for i, chunk in enumerate(chunks[:10]): # Show first 10 chunks
+        for i, chunk in enumerate(chunks): # Show all chunks
             chunks_display += f"--- Chunk {i+1} ---\n"
             chunks_display += f"Length: {len(chunk.content)} characters\n"
-            chunks_display += f"Content: {chunk.content[:200]}{'...' if len(chunk.content) > 200 else ''}\n\n"
+            chunks_display += f"Content: {chunk.content}\n\n"
 
     if len(chunks) > 10:
         chunks_display += f"... and {len(chunks) - 10} more chunks\n"
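For review, here is a minimal standalone sketch of the new chunking path, using the same parameters as clean_and_chunk_text above; the sample string and print format are made up for illustration and are not part of the commit:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same configuration as the committed code
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
    is_separator_regex=False,
)

# Hypothetical input: two long "paragraphs" separated by a blank line
sample = "First topic sentence. " * 40 + "\n\n" + "Second topic sentence. " * 40
for i, chunk in enumerate(splitter.split_text(sample)):
    print(f"chunk {i}: {len(chunk)} chars, starts {chunk[:25]!r}")

The splitter tries each separator in priority order ("\n\n" first) and only falls back to the next one when a piece is still longer than chunk_size, so paragraph and sentence boundaries are preserved where possible — the behavior the removed simple_text_splitter approximated by hand.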
requirements.txt CHANGED
@@ -8,6 +8,9 @@ python-multipart>=0.0.9
 PyPDF2==3.0.1
 python-docx==1.1.0
 
+# LangChain text splitters (standalone package)
+langchain-text-splitters==0.0.1
+
 # Utilities
 python-dotenv==1.0.0
 
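A quick way to confirm the new pin resolves after installing (a sketch using only the standard library):

from importlib.metadata import version
print(version("langchain-text-splitters"))  # expect 0.0.1, matching requirements.txt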