yangdx committed on
Commit
bfb441a
·
1 Parent(s): 8db9467

Improved file handling and validation for document processing

Browse files

• Enhanced UTF-8 validation for text files
• Added content validation checks
• Better handling of binary data
• Added logging for ignored document IDs
• Improved document ID filtering

lightrag/api/routers/document_routes.py CHANGED
@@ -215,7 +215,27 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
215
  | ".scss"
216
  | ".less"
217
  ):
218
- content = file.decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  case ".pdf":
220
  if not pm.is_installed("pypdf2"):
221
  pm.install("pypdf2")
@@ -229,7 +249,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
229
  case ".docx":
230
  if not pm.is_installed("docx"):
231
  pm.install("docx")
232
- from docx import Document
233
  from io import BytesIO
234
 
235
  docx_file = BytesIO(file)
@@ -238,7 +258,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
238
  case ".pptx":
239
  if not pm.is_installed("pptx"):
240
  pm.install("pptx")
241
- from pptx import Presentation
242
  from io import BytesIO
243
 
244
  pptx_file = BytesIO(file)
@@ -250,7 +270,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
250
  case ".xlsx":
251
  if not pm.is_installed("openpyxl"):
252
  pm.install("openpyxl")
253
- from openpyxl import load_workbook
254
  from io import BytesIO
255
 
256
  xlsx_file = BytesIO(file)
 
215
  | ".scss"
216
  | ".less"
217
  ):
218
+ try:
219
+ # Try to decode as UTF-8
220
+ content = file.decode("utf-8")
221
+
222
+ # Validate content
223
+ if not content or len(content.strip()) == 0:
224
+ logger.error(f"Empty content in file: {file_path.name}")
225
+ return False
226
+
227
+ # Check if content looks like binary data string representation
228
+ if content.startswith("b'") or content.startswith('b"'):
229
+ logger.error(
230
+ f"File {file_path.name} appears to contain binary data representation instead of text"
231
+ )
232
+ return False
233
+
234
+ except UnicodeDecodeError:
235
+ logger.error(
236
+ f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
237
+ )
238
+ return False
239
  case ".pdf":
240
  if not pm.is_installed("pypdf2"):
241
  pm.install("pypdf2")
 
249
  case ".docx":
250
  if not pm.is_installed("docx"):
251
  pm.install("docx")
252
+ from docx import Document # type: ignore
253
  from io import BytesIO
254
 
255
  docx_file = BytesIO(file)
 
258
  case ".pptx":
259
  if not pm.is_installed("pptx"):
260
  pm.install("pptx")
261
+ from pptx import Presentation # type: ignore
262
  from io import BytesIO
263
 
264
  pptx_file = BytesIO(file)
 
270
  case ".xlsx":
271
  if not pm.is_installed("openpyxl"):
272
  pm.install("openpyxl")
273
+ from openpyxl import load_workbook # type: ignore
274
  from io import BytesIO
275
 
276
  xlsx_file = BytesIO(file)
lightrag/lightrag.py CHANGED
@@ -670,8 +670,24 @@ class LightRAG:
670
  all_new_doc_ids = set(new_docs.keys())
671
  # Exclude IDs of documents that are already in progress
672
  unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
 
 
 
 
 
 
 
 
 
 
 
 
673
  # Filter new_docs to only include documents with unique IDs
674
- new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
 
 
 
 
675
 
676
  if not new_docs:
677
  logger.info("No new unique documents were found.")
 
670
  all_new_doc_ids = set(new_docs.keys())
671
  # Exclude IDs of documents that are already in progress
672
  unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
673
+
674
+ # Log ignored document IDs
675
+ ignored_ids = [
676
+ doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
677
+ ]
678
+ if ignored_ids:
679
+ logger.warning(
680
+ f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
681
+ )
682
+ for doc_id in ignored_ids:
683
+ logger.warning(f"Ignored document ID: {doc_id}")
684
+
685
  # Filter new_docs to only include documents with unique IDs
686
+ new_docs = {
687
+ doc_id: new_docs[doc_id]
688
+ for doc_id in unique_new_doc_ids
689
+ if doc_id in new_docs
690
+ }
691
 
692
  if not new_docs:
693
  logger.info("No new unique documents were found.")