yangdx
committed on
Commit
·
bfb441a
1
Parent(s):
8db9467
Improved file handling and validation for document processing
Browse files
• Enhanced UTF-8 validation for text files
• Added content validation checks
• Better handling of binary data
• Added logging for ignored document IDs
• Improved document ID filtering
- lightrag/api/routers/document_routes.py +24 -4
- lightrag/lightrag.py +17 -1
lightrag/api/routers/document_routes.py
CHANGED
|
@@ -215,7 +215,27 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
| 215 |
| ".scss"
|
| 216 |
| ".less"
|
| 217 |
):
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
case ".pdf":
|
| 220 |
if not pm.is_installed("pypdf2"):
|
| 221 |
pm.install("pypdf2")
|
|
@@ -229,7 +249,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
| 229 |
case ".docx":
|
| 230 |
if not pm.is_installed("docx"):
|
| 231 |
pm.install("docx")
|
| 232 |
-
from docx import Document
|
| 233 |
from io import BytesIO
|
| 234 |
|
| 235 |
docx_file = BytesIO(file)
|
|
@@ -238,7 +258,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
| 238 |
case ".pptx":
|
| 239 |
if not pm.is_installed("pptx"):
|
| 240 |
pm.install("pptx")
|
| 241 |
-
from pptx import Presentation
|
| 242 |
from io import BytesIO
|
| 243 |
|
| 244 |
pptx_file = BytesIO(file)
|
|
@@ -250,7 +270,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
| 250 |
case ".xlsx":
|
| 251 |
if not pm.is_installed("openpyxl"):
|
| 252 |
pm.install("openpyxl")
|
| 253 |
-
from openpyxl import load_workbook
|
| 254 |
from io import BytesIO
|
| 255 |
|
| 256 |
xlsx_file = BytesIO(file)
|
|
|
|
| 215 |
| ".scss"
|
| 216 |
| ".less"
|
| 217 |
):
|
| 218 |
+
try:
|
| 219 |
+
# Try to decode as UTF-8
|
| 220 |
+
content = file.decode("utf-8")
|
| 221 |
+
|
| 222 |
+
# Validate content
|
| 223 |
+
if not content or len(content.strip()) == 0:
|
| 224 |
+
logger.error(f"Empty content in file: {file_path.name}")
|
| 225 |
+
return False
|
| 226 |
+
|
| 227 |
+
# Check if content looks like binary data string representation
|
| 228 |
+
if content.startswith("b'") or content.startswith('b"'):
|
| 229 |
+
logger.error(
|
| 230 |
+
f"File {file_path.name} appears to contain binary data representation instead of text"
|
| 231 |
+
)
|
| 232 |
+
return False
|
| 233 |
+
|
| 234 |
+
except UnicodeDecodeError:
|
| 235 |
+
logger.error(
|
| 236 |
+
f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
|
| 237 |
+
)
|
| 238 |
+
return False
|
| 239 |
case ".pdf":
|
| 240 |
if not pm.is_installed("pypdf2"):
|
| 241 |
pm.install("pypdf2")
|
|
|
|
| 249 |
case ".docx":
|
| 250 |
if not pm.is_installed("docx"):
|
| 251 |
pm.install("docx")
|
| 252 |
+
from docx import Document # type: ignore
|
| 253 |
from io import BytesIO
|
| 254 |
|
| 255 |
docx_file = BytesIO(file)
|
|
|
|
| 258 |
case ".pptx":
|
| 259 |
if not pm.is_installed("pptx"):
|
| 260 |
pm.install("pptx")
|
| 261 |
+
from pptx import Presentation # type: ignore
|
| 262 |
from io import BytesIO
|
| 263 |
|
| 264 |
pptx_file = BytesIO(file)
|
|
|
|
| 270 |
case ".xlsx":
|
| 271 |
if not pm.is_installed("openpyxl"):
|
| 272 |
pm.install("openpyxl")
|
| 273 |
+
from openpyxl import load_workbook # type: ignore
|
| 274 |
from io import BytesIO
|
| 275 |
|
| 276 |
xlsx_file = BytesIO(file)
|
lightrag/lightrag.py
CHANGED
|
@@ -670,8 +670,24 @@ class LightRAG:
|
|
| 670 |
all_new_doc_ids = set(new_docs.keys())
|
| 671 |
# Exclude IDs of documents that are already in progress
|
| 672 |
unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
# Filter new_docs to only include documents with unique IDs
|
| 674 |
-
new_docs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 675 |
|
| 676 |
if not new_docs:
|
| 677 |
logger.info("No new unique documents were found.")
|
|
|
|
| 670 |
all_new_doc_ids = set(new_docs.keys())
|
| 671 |
# Exclude IDs of documents that are already in progress
|
| 672 |
unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
|
| 673 |
+
|
| 674 |
+
# Log ignored document IDs
|
| 675 |
+
ignored_ids = [
|
| 676 |
+
doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
|
| 677 |
+
]
|
| 678 |
+
if ignored_ids:
|
| 679 |
+
logger.warning(
|
| 680 |
+
f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
|
| 681 |
+
)
|
| 682 |
+
for doc_id in ignored_ids:
|
| 683 |
+
logger.warning(f"Ignored document ID: {doc_id}")
|
| 684 |
+
|
| 685 |
# Filter new_docs to only include documents with unique IDs
|
| 686 |
+
new_docs = {
|
| 687 |
+
doc_id: new_docs[doc_id]
|
| 688 |
+
for doc_id in unique_new_doc_ids
|
| 689 |
+
if doc_id in new_docs
|
| 690 |
+
}
|
| 691 |
|
| 692 |
if not new_docs:
|
| 693 |
logger.info("No new unique documents were found.")
|