LarFii committed on
Commit
ccc1a22
·
2 Parent(s): 2a07cc4 2deecf6

Merge branch 'main' of https://github.com/HKUDS/LightRAG

Browse files
MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ recursive-include lightrag/api/webui *
lightrag/api/lightrag_server.py CHANGED
@@ -6,7 +6,6 @@ from fastapi import (
6
  FastAPI,
7
  Depends,
8
  )
9
- from fastapi.responses import FileResponse
10
  import asyncio
11
  import os
12
  import logging
@@ -408,10 +407,6 @@ def create_app(args):
408
  name="webui",
409
  )
410
 
411
- @app.get("/webui/")
412
- async def webui_root():
413
- return FileResponse(static_dir / "index.html")
414
-
415
  return app
416
 
417
 
 
6
  FastAPI,
7
  Depends,
8
  )
 
9
  import asyncio
10
  import os
11
  import logging
 
407
  name="webui",
408
  )
409
 
 
 
 
 
410
  return app
411
 
412
 
lightrag/api/routers/document_routes.py CHANGED
@@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
215
  | ".scss"
216
  | ".less"
217
  ):
218
- content = file.decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  case ".pdf":
220
- if not pm.is_installed("pypdf2"):
221
  pm.install("pypdf2")
222
  from PyPDF2 import PdfReader # type: ignore
223
  from io import BytesIO
@@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
227
  for page in reader.pages:
228
  content += page.extract_text() + "\n"
229
  case ".docx":
230
- if not pm.is_installed("docx"):
231
  pm.install("docx")
232
- from docx import Document
233
  from io import BytesIO
234
 
235
  docx_file = BytesIO(file)
236
  doc = Document(docx_file)
237
  content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
238
  case ".pptx":
239
- if not pm.is_installed("pptx"):
240
  pm.install("pptx")
241
- from pptx import Presentation
242
  from io import BytesIO
243
 
244
  pptx_file = BytesIO(file)
@@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
248
  if hasattr(shape, "text"):
249
  content += shape.text + "\n"
250
  case ".xlsx":
251
- if not pm.is_installed("openpyxl"):
252
  pm.install("openpyxl")
253
- from openpyxl import load_workbook
254
  from io import BytesIO
255
 
256
  xlsx_file = BytesIO(file)
 
215
  | ".scss"
216
  | ".less"
217
  ):
218
+ try:
219
+ # Try to decode as UTF-8
220
+ content = file.decode("utf-8")
221
+
222
+ # Validate content
223
+ if not content or len(content.strip()) == 0:
224
+ logger.error(f"Empty content in file: {file_path.name}")
225
+ return False
226
+
227
+ # Check if content looks like binary data string representation
228
+ if content.startswith("b'") or content.startswith('b"'):
229
+ logger.error(
230
+ f"File {file_path.name} appears to contain binary data representation instead of text"
231
+ )
232
+ return False
233
+
234
+ except UnicodeDecodeError:
235
+ logger.error(
236
+ f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
237
+ )
238
+ return False
239
  case ".pdf":
240
+ if not pm.is_installed("pypdf2"): # type: ignore
241
  pm.install("pypdf2")
242
  from PyPDF2 import PdfReader # type: ignore
243
  from io import BytesIO
 
247
  for page in reader.pages:
248
  content += page.extract_text() + "\n"
249
  case ".docx":
250
+ if not pm.is_installed("python-docx"): # type: ignore
251
  pm.install("docx")
252
+ from docx import Document # type: ignore
253
  from io import BytesIO
254
 
255
  docx_file = BytesIO(file)
256
  doc = Document(docx_file)
257
  content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
258
  case ".pptx":
259
+ if not pm.is_installed("python-pptx"): # type: ignore
260
  pm.install("pptx")
261
+ from pptx import Presentation # type: ignore
262
  from io import BytesIO
263
 
264
  pptx_file = BytesIO(file)
 
268
  if hasattr(shape, "text"):
269
  content += shape.text + "\n"
270
  case ".xlsx":
271
+ if not pm.is_installed("openpyxl"): # type: ignore
272
  pm.install("openpyxl")
273
+ from openpyxl import load_workbook # type: ignore
274
  from io import BytesIO
275
 
276
  xlsx_file = BytesIO(file)
lightrag/lightrag.py CHANGED
@@ -685,8 +685,24 @@ class LightRAG:
685
  all_new_doc_ids = set(new_docs.keys())
686
  # Exclude IDs of documents that are already in progress
687
  unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
 
 
 
 
 
 
 
 
 
 
 
 
688
  # Filter new_docs to only include documents with unique IDs
689
- new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
 
 
 
 
690
 
691
  if not new_docs:
692
  logger.info("No new unique documents were found.")
 
685
  all_new_doc_ids = set(new_docs.keys())
686
  # Exclude IDs of documents that are already in progress
687
  unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
688
+
689
+ # Log ignored document IDs
690
+ ignored_ids = [
691
+ doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
692
+ ]
693
+ if ignored_ids:
694
+ logger.warning(
695
+ f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
696
+ )
697
+ for doc_id in ignored_ids:
698
+ logger.warning(f"Ignored document ID: {doc_id}")
699
+
700
  # Filter new_docs to only include documents with unique IDs
701
+ new_docs = {
702
+ doc_id: new_docs[doc_id]
703
+ for doc_id in unique_new_doc_ids
704
+ if doc_id in new_docs
705
+ }
706
 
707
  if not new_docs:
708
  logger.info("No new unique documents were found.")