ParisNeo committed on
Commit
ed548ce
·
1 Parent(s): 91cc9f8

fixed linting

Browse files
lightrag/api/lightrag_server.py CHANGED
@@ -175,7 +175,11 @@ def parse_args():
175
  class DocumentManager:
176
  """Handles document operations and tracking"""
177
 
178
- def __init__(self, input_dir: str, supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx")):
 
 
 
 
179
  self.input_dir = Path(input_dir)
180
  self.supported_extensions = supported_extensions
181
  self.indexed_files = set()
@@ -357,26 +361,22 @@ def create_app(args):
357
  ),
358
  )
359
 
360
-
361
-
362
  async def index_file(file_path: Union[str, Path]) -> None:
363
- """ Index all files inside the folder with support for multiple file formats
364
-
365
  Args:
366
  file_path: Path to the file to be indexed (str or Path object)
367
-
368
  Raises:
369
  ValueError: If file format is not supported
370
  FileNotFoundError: If file doesn't exist
371
  """
372
  if not pm.is_installed("aiofiles"):
373
  pm.install("aiofiles")
374
- import aiofiles
375
-
376
-
377
  # Convert to Path object if string
378
  file_path = Path(file_path)
379
-
380
  # Check if file exists
381
  if not file_path.exists():
382
  raise FileNotFoundError(f"File not found: {file_path}")
@@ -384,23 +384,24 @@ def create_app(args):
384
  content = ""
385
  # Get file extension in lowercase
386
  ext = file_path.suffix.lower()
387
-
388
  match ext:
389
  case ".txt" | ".md":
390
  # Text files handling
391
  async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
392
  content = await f.read()
393
-
394
  case ".pdf":
395
  if not pm.is_installed("pypdf2"):
396
  pm.install("pypdf2")
397
  from pypdf2 import PdfReader
 
398
  # PDF handling
399
  reader = PdfReader(str(file_path))
400
  content = ""
401
  for page in reader.pages:
402
  content += page.extract_text() + "\n"
403
-
404
  case ".docx":
405
  if not pm.is_installed("docx"):
406
  pm.install("docx")
@@ -409,11 +410,12 @@ def create_app(args):
409
  # Word document handling
410
  doc = Document(file_path)
411
  content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
412
-
413
  case ".pptx":
414
  if not pm.is_installed("pptx"):
415
  pm.install("pptx")
416
  from pptx import Presentation
 
417
  # PowerPoint handling
418
  prs = Presentation(file_path)
419
  content = ""
@@ -421,7 +423,7 @@ def create_app(args):
421
  for shape in slide.shapes:
422
  if hasattr(shape, "text"):
423
  content += shape.text + "\n"
424
-
425
  case _:
426
  raise ValueError(f"Unsupported file format: {ext}")
427
 
@@ -433,9 +435,6 @@ def create_app(args):
433
  else:
434
  logging.warning(f"No content extracted from file: {file_path}")
435
 
436
-
437
-
438
-
439
  @app.on_event("startup")
440
  async def startup_event():
441
  """Index all files in input directory during startup"""
@@ -559,6 +558,7 @@ def create_app(args):
559
  )
560
  except Exception as e:
561
  raise HTTPException(status_code=500, detail=str(e))
 
562
  @app.post(
563
  "/documents/file",
564
  response_model=InsertResponse,
@@ -566,14 +566,14 @@ def create_app(args):
566
  )
567
  async def insert_file(file: UploadFile = File(...), description: str = Form(None)):
568
  """Insert a file directly into the RAG system
569
-
570
  Args:
571
  file: Uploaded file
572
  description: Optional description of the file
573
-
574
  Returns:
575
  InsertResponse: Status of the insertion operation
576
-
577
  Raises:
578
  HTTPException: For unsupported file types or processing errors
579
  """
@@ -581,19 +581,19 @@ def create_app(args):
581
  content = ""
582
  # Get file extension in lowercase
583
  ext = Path(file.filename).suffix.lower()
584
-
585
  match ext:
586
  case ".txt" | ".md":
587
  # Text files handling
588
  text_content = await file.read()
589
  content = text_content.decode("utf-8")
590
-
591
  case ".pdf":
592
  if not pm.is_installed("pypdf2"):
593
  pm.install("pypdf2")
594
  from pypdf2 import PdfReader
595
  from io import BytesIO
596
-
597
  # Read PDF from memory
598
  pdf_content = await file.read()
599
  pdf_file = BytesIO(pdf_content)
@@ -601,25 +601,27 @@ def create_app(args):
601
  content = ""
602
  for page in reader.pages:
603
  content += page.extract_text() + "\n"
604
-
605
  case ".docx":
606
  if not pm.is_installed("docx"):
607
  pm.install("docx")
608
  from docx import Document
609
  from io import BytesIO
610
-
611
  # Read DOCX from memory
612
  docx_content = await file.read()
613
  docx_file = BytesIO(docx_content)
614
  doc = Document(docx_file)
615
- content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
616
-
 
 
617
  case ".pptx":
618
  if not pm.is_installed("pptx"):
619
  pm.install("pptx")
620
  from pptx import Presentation
621
  from io import BytesIO
622
-
623
  # Read PPTX from memory
624
  pptx_content = await file.read()
625
  pptx_file = BytesIO(pptx_content)
@@ -629,7 +631,7 @@ def create_app(args):
629
  for shape in slide.shapes:
630
  if hasattr(shape, "text"):
631
  content += shape.text + "\n"
632
-
633
  case _:
634
  raise HTTPException(
635
  status_code=400,
@@ -641,10 +643,10 @@ def create_app(args):
641
  # Add description if provided
642
  if description:
643
  content = f"{description}\n\n{content}"
644
-
645
  await rag.ainsert(content)
646
  logging.info(f"Successfully indexed file: {file.filename}")
647
-
648
  return InsertResponse(
649
  status="success",
650
  message=f"File '{file.filename}' successfully inserted",
@@ -661,6 +663,7 @@ def create_app(args):
661
  except Exception as e:
662
  logging.error(f"Error processing file {file.filename}: {str(e)}")
663
  raise HTTPException(status_code=500, detail=str(e))
 
664
  @app.post(
665
  "/documents/batch",
666
  response_model=InsertResponse,
@@ -668,13 +671,13 @@ def create_app(args):
668
  )
669
  async def insert_batch(files: List[UploadFile] = File(...)):
670
  """Process multiple files in batch mode
671
-
672
  Args:
673
  files: List of files to process
674
-
675
  Returns:
676
  InsertResponse: Status of the batch insertion operation
677
-
678
  Raises:
679
  HTTPException: For processing errors
680
  """
@@ -686,41 +689,43 @@ def create_app(args):
686
  try:
687
  content = ""
688
  ext = Path(file.filename).suffix.lower()
689
-
690
  match ext:
691
  case ".txt" | ".md":
692
  text_content = await file.read()
693
  content = text_content.decode("utf-8")
694
-
695
  case ".pdf":
696
  if not pm.is_installed("pypdf2"):
697
  pm.install("pypdf2")
698
  from pypdf2 import PdfReader
699
  from io import BytesIO
700
-
701
  pdf_content = await file.read()
702
  pdf_file = BytesIO(pdf_content)
703
  reader = PdfReader(pdf_file)
704
  for page in reader.pages:
705
  content += page.extract_text() + "\n"
706
-
707
  case ".docx":
708
  if not pm.is_installed("docx"):
709
  pm.install("docx")
710
  from docx import Document
711
  from io import BytesIO
712
-
713
  docx_content = await file.read()
714
  docx_file = BytesIO(docx_content)
715
  doc = Document(docx_file)
716
- content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
717
-
 
 
718
  case ".pptx":
719
  if not pm.is_installed("pptx"):
720
  pm.install("pptx")
721
  from pptx import Presentation
722
  from io import BytesIO
723
-
724
  pptx_content = await file.read()
725
  pptx_file = BytesIO(pptx_content)
726
  prs = Presentation(pptx_file)
@@ -728,7 +733,7 @@ def create_app(args):
728
  for shape in slide.shapes:
729
  if hasattr(shape, "text"):
730
  content += shape.text + "\n"
731
-
732
  case _:
733
  failed_files.append(f"{file.filename} (unsupported type)")
734
  continue
@@ -771,7 +776,6 @@ def create_app(args):
771
  logging.error(f"Batch processing error: {str(e)}")
772
  raise HTTPException(status_code=500, detail=str(e))
773
 
774
-
775
  @app.delete(
776
  "/documents",
777
  response_model=InsertResponse,
 
175
  class DocumentManager:
176
  """Handles document operations and tracking"""
177
 
178
+ def __init__(
179
+ self,
180
+ input_dir: str,
181
+ supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
182
+ ):
183
  self.input_dir = Path(input_dir)
184
  self.supported_extensions = supported_extensions
185
  self.indexed_files = set()
 
361
  ),
362
  )
363
 
 
 
364
  async def index_file(file_path: Union[str, Path]) -> None:
365
+ """Index all files inside the folder with support for multiple file formats
366
+
367
  Args:
368
  file_path: Path to the file to be indexed (str or Path object)
369
+
370
  Raises:
371
  ValueError: If file format is not supported
372
  FileNotFoundError: If file doesn't exist
373
  """
374
  if not pm.is_installed("aiofiles"):
375
  pm.install("aiofiles")
376
+
 
 
377
  # Convert to Path object if string
378
  file_path = Path(file_path)
379
+
380
  # Check if file exists
381
  if not file_path.exists():
382
  raise FileNotFoundError(f"File not found: {file_path}")
 
384
  content = ""
385
  # Get file extension in lowercase
386
  ext = file_path.suffix.lower()
387
+
388
  match ext:
389
  case ".txt" | ".md":
390
  # Text files handling
391
  async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
392
  content = await f.read()
393
+
394
  case ".pdf":
395
  if not pm.is_installed("pypdf2"):
396
  pm.install("pypdf2")
397
  from pypdf2 import PdfReader
398
+
399
  # PDF handling
400
  reader = PdfReader(str(file_path))
401
  content = ""
402
  for page in reader.pages:
403
  content += page.extract_text() + "\n"
404
+
405
  case ".docx":
406
  if not pm.is_installed("docx"):
407
  pm.install("docx")
 
410
  # Word document handling
411
  doc = Document(file_path)
412
  content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
413
+
414
  case ".pptx":
415
  if not pm.is_installed("pptx"):
416
  pm.install("pptx")
417
  from pptx import Presentation
418
+
419
  # PowerPoint handling
420
  prs = Presentation(file_path)
421
  content = ""
 
423
  for shape in slide.shapes:
424
  if hasattr(shape, "text"):
425
  content += shape.text + "\n"
426
+
427
  case _:
428
  raise ValueError(f"Unsupported file format: {ext}")
429
 
 
435
  else:
436
  logging.warning(f"No content extracted from file: {file_path}")
437
 
 
 
 
438
  @app.on_event("startup")
439
  async def startup_event():
440
  """Index all files in input directory during startup"""
 
558
  )
559
  except Exception as e:
560
  raise HTTPException(status_code=500, detail=str(e))
561
+
562
  @app.post(
563
  "/documents/file",
564
  response_model=InsertResponse,
 
566
  )
567
  async def insert_file(file: UploadFile = File(...), description: str = Form(None)):
568
  """Insert a file directly into the RAG system
569
+
570
  Args:
571
  file: Uploaded file
572
  description: Optional description of the file
573
+
574
  Returns:
575
  InsertResponse: Status of the insertion operation
576
+
577
  Raises:
578
  HTTPException: For unsupported file types or processing errors
579
  """
 
581
  content = ""
582
  # Get file extension in lowercase
583
  ext = Path(file.filename).suffix.lower()
584
+
585
  match ext:
586
  case ".txt" | ".md":
587
  # Text files handling
588
  text_content = await file.read()
589
  content = text_content.decode("utf-8")
590
+
591
  case ".pdf":
592
  if not pm.is_installed("pypdf2"):
593
  pm.install("pypdf2")
594
  from pypdf2 import PdfReader
595
  from io import BytesIO
596
+
597
  # Read PDF from memory
598
  pdf_content = await file.read()
599
  pdf_file = BytesIO(pdf_content)
 
601
  content = ""
602
  for page in reader.pages:
603
  content += page.extract_text() + "\n"
604
+
605
  case ".docx":
606
  if not pm.is_installed("docx"):
607
  pm.install("docx")
608
  from docx import Document
609
  from io import BytesIO
610
+
611
  # Read DOCX from memory
612
  docx_content = await file.read()
613
  docx_file = BytesIO(docx_content)
614
  doc = Document(docx_file)
615
+ content = "\n".join(
616
+ [paragraph.text for paragraph in doc.paragraphs]
617
+ )
618
+
619
  case ".pptx":
620
  if not pm.is_installed("pptx"):
621
  pm.install("pptx")
622
  from pptx import Presentation
623
  from io import BytesIO
624
+
625
  # Read PPTX from memory
626
  pptx_content = await file.read()
627
  pptx_file = BytesIO(pptx_content)
 
631
  for shape in slide.shapes:
632
  if hasattr(shape, "text"):
633
  content += shape.text + "\n"
634
+
635
  case _:
636
  raise HTTPException(
637
  status_code=400,
 
643
  # Add description if provided
644
  if description:
645
  content = f"{description}\n\n{content}"
646
+
647
  await rag.ainsert(content)
648
  logging.info(f"Successfully indexed file: {file.filename}")
649
+
650
  return InsertResponse(
651
  status="success",
652
  message=f"File '{file.filename}' successfully inserted",
 
663
  except Exception as e:
664
  logging.error(f"Error processing file {file.filename}: {str(e)}")
665
  raise HTTPException(status_code=500, detail=str(e))
666
+
667
  @app.post(
668
  "/documents/batch",
669
  response_model=InsertResponse,
 
671
  )
672
  async def insert_batch(files: List[UploadFile] = File(...)):
673
  """Process multiple files in batch mode
674
+
675
  Args:
676
  files: List of files to process
677
+
678
  Returns:
679
  InsertResponse: Status of the batch insertion operation
680
+
681
  Raises:
682
  HTTPException: For processing errors
683
  """
 
689
  try:
690
  content = ""
691
  ext = Path(file.filename).suffix.lower()
692
+
693
  match ext:
694
  case ".txt" | ".md":
695
  text_content = await file.read()
696
  content = text_content.decode("utf-8")
697
+
698
  case ".pdf":
699
  if not pm.is_installed("pypdf2"):
700
  pm.install("pypdf2")
701
  from pypdf2 import PdfReader
702
  from io import BytesIO
703
+
704
  pdf_content = await file.read()
705
  pdf_file = BytesIO(pdf_content)
706
  reader = PdfReader(pdf_file)
707
  for page in reader.pages:
708
  content += page.extract_text() + "\n"
709
+
710
  case ".docx":
711
  if not pm.is_installed("docx"):
712
  pm.install("docx")
713
  from docx import Document
714
  from io import BytesIO
715
+
716
  docx_content = await file.read()
717
  docx_file = BytesIO(docx_content)
718
  doc = Document(docx_file)
719
+ content = "\n".join(
720
+ [paragraph.text for paragraph in doc.paragraphs]
721
+ )
722
+
723
  case ".pptx":
724
  if not pm.is_installed("pptx"):
725
  pm.install("pptx")
726
  from pptx import Presentation
727
  from io import BytesIO
728
+
729
  pptx_content = await file.read()
730
  pptx_file = BytesIO(pptx_content)
731
  prs = Presentation(pptx_file)
 
733
  for shape in slide.shapes:
734
  if hasattr(shape, "text"):
735
  content += shape.text + "\n"
736
+
737
  case _:
738
  failed_files.append(f"{file.filename} (unsupported type)")
739
  continue
 
776
  logging.error(f"Batch processing error: {str(e)}")
777
  raise HTTPException(status_code=500, detail=str(e))
778
 
 
779
  @app.delete(
780
  "/documents",
781
  response_model=InsertResponse,
lightrag/api/requirements.txt CHANGED
@@ -7,6 +7,7 @@ nest_asyncio
7
  numpy
8
  ollama
9
  openai
 
10
  python-dotenv
11
  python-multipart
12
  tenacity
@@ -15,4 +16,3 @@ torch
15
  tqdm
16
  transformers
17
  uvicorn
18
- pipmaster
 
7
  numpy
8
  ollama
9
  openai
10
+ pipmaster
11
  python-dotenv
12
  python-multipart
13
  tenacity
 
16
  tqdm
17
  transformers
18
  uvicorn