heymenn committed on
Commit
a2e557e
1 Parent(s): 69bb9c0

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +93 -91
split_files_to_excel.py CHANGED
@@ -527,101 +527,103 @@ def split_by_keywords(files, key_words,words_limit=1000):
527
 
528
  tabLine = []
529
  for file in files:
530
-
531
- file = PdfReader(file)
532
- pdfNumberPages = len(file.pages)
533
- for pdfPage in range(0, pdfNumberPages):
534
-
535
- load_page = file.get_page(pdfPage)
536
- text = load_page.extract_text()
537
- lines = text.split("\n")
538
- sizeOfLines = len(lines) - 1
539
-
540
- for index, line in enumerate(lines):
541
- print(line)
542
- for key in key_words:
543
- line = line.lower()
544
-
545
- if key in line:
546
- print("Found keyword")
547
-
548
- # Init variables for search
549
- lineBool = True
550
- lineIndex = index
551
- previousSelectedLines = []
552
- stringLength = 0
553
- linesForSelection = lines
554
- loadOnce = True
555
- selectedPdfPage = pdfPage
556
-
557
- # Loop while extracting text before keyword
558
- while lineBool:
559
- print(lineIndex)
560
- if stringLength > words_limit or lineIndex < 0:
561
- lineBool = False
562
- else:
563
- if lineIndex == 0:
564
- if pdfPage == 0:
565
- lineBool = False
566
-
567
- # Load previous page
568
- else:
569
- try:
570
- selectedPdfPage -= 1
571
- newLoad_page = file.get_page(selectedPdfPage)
572
- newText = newLoad_page.extract_text()
573
- newLines = newText.split("\n")
574
- linesForSelection = newLines
575
- lineIndex = len(newLines) - 1
576
-
577
- except Exception as e:
578
- print(f"Loading previous PDF page failed")
579
  lineBool = False
580
-
581
- previousSelectedLines.append(linesForSelection[lineIndex])
582
- stringLength += len(linesForSelection[lineIndex])
583
-
584
- lineIndex -= 1
585
- previousSelectedLines = ' '.join(previousSelectedLines[::-1])
586
-
587
- # Init variables for search
588
- lineBool = True
589
- lineIndex = index + 1
590
- nextSelectedLines = ""
591
- linesForSelection = lines
592
- loadOnce = True
593
- selectedPdfPage = pdfPage
594
-
595
- # Loop while extracting text after keyword
596
- while lineBool:
597
-
598
- if len(nextSelectedLines.split()) > words_limit:
599
- lineBool = False
600
- else:
601
- if lineIndex > sizeOfLines:
 
 
 
 
 
 
 
 
 
 
 
602
  lineBool = False
603
-
604
- if pdfPage == pdfNumberPages - 1:
605
  lineBool = False
606
-
607
- # Load next page
608
- else:
609
- try:
610
- selectedPdfPage += 1
611
- newLoad_page = file.get_page(selectedPdfPage)
612
- newText = newLoad_page.extract_text()
613
- newLines = newText.split("\n")
614
- linesForSelection = newLines
615
- lineIndex = 0
616
- except Exception as e:
617
- print(f"Loading next PDF page failed")
618
  lineBool = False
619
- else:
620
- nextSelectedLines += " " + linesForSelection[lineIndex]
621
- lineIndex += 1
622
-
623
- selectedText = previousSelectedLines + ' ' + nextSelectedLines
624
- tabLine.append([pdfPage, selectedText, key])
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
  for r in tabLine:
627
  text_joined = ''.join(r[1])
 
527
 
528
  tabLine = []
529
  for file in files:
530
+
531
+ if file.endswith('pdf'):
532
+
533
+ file = PdfReader(file)
534
+ pdfNumberPages = len(file.pages)
535
+ for pdfPage in range(0, pdfNumberPages):
536
+
537
+ load_page = file.get_page(pdfPage)
538
+ text = load_page.extract_text()
539
+ lines = text.split("\n")
540
+ sizeOfLines = len(lines) - 1
541
+
542
+ for index, line in enumerate(lines):
543
+ print(line)
544
+ for key in key_words:
545
+ line = line.lower()
546
+
547
+ if key in line:
548
+ print("Found keyword")
549
+
550
+ # Init variables for search
551
+ lineBool = True
552
+ lineIndex = index
553
+ previousSelectedLines = []
554
+ stringLength = 0
555
+ linesForSelection = lines
556
+ loadOnce = True
557
+ selectedPdfPage = pdfPage
558
+
559
+ # Loop while extracting text before keyword
560
+ while lineBool:
561
+ print(lineIndex)
562
+ if stringLength > words_limit or lineIndex < 0:
563
+ lineBool = False
564
+ else:
565
+ if lineIndex == 0:
566
+ if pdfPage == 0:
 
 
 
 
 
 
 
 
 
 
 
 
567
  lineBool = False
568
+
569
+ # Load previous page
570
+ else:
571
+ try:
572
+ selectedPdfPage -= 1
573
+ newLoad_page = file.get_page(selectedPdfPage)
574
+ newText = newLoad_page.extract_text()
575
+ newLines = newText.split("\n")
576
+ linesForSelection = newLines
577
+ lineIndex = len(newLines) - 1
578
+
579
+ except Exception as e:
580
+ print(f"Loading previous PDF page failed")
581
+ lineBool = False
582
+
583
+ previousSelectedLines.append(linesForSelection[lineIndex])
584
+ stringLength += len(linesForSelection[lineIndex])
585
+
586
+ lineIndex -= 1
587
+ previousSelectedLines = ' '.join(previousSelectedLines[::-1])
588
+
589
+ # Init variables for search
590
+ lineBool = True
591
+ lineIndex = index + 1
592
+ nextSelectedLines = ""
593
+ linesForSelection = lines
594
+ loadOnce = True
595
+ selectedPdfPage = pdfPage
596
+
597
+ # Loop while extracting text after keyword
598
+ while lineBool:
599
+
600
+ if len(nextSelectedLines.split()) > words_limit:
601
  lineBool = False
602
+ else:
603
+ if lineIndex > sizeOfLines:
604
  lineBool = False
605
+
606
+ if pdfPage == pdfNumberPages - 1:
 
 
 
 
 
 
 
 
 
 
607
  lineBool = False
608
+
609
+ # Load next page
610
+ else:
611
+ try:
612
+ selectedPdfPage += 1
613
+ newLoad_page = file.get_page(selectedPdfPage)
614
+ newText = newLoad_page.extract_text()
615
+ newLines = newText.split("\n")
616
+ linesForSelection = newLines
617
+ lineIndex = 0
618
+ except Exception as e:
619
+ print(f"Loading next PDF page failed")
620
+ lineBool = False
621
+ else:
622
+ nextSelectedLines += " " + linesForSelection[lineIndex]
623
+ lineIndex += 1
624
+
625
+ selectedText = previousSelectedLines + ' ' + nextSelectedLines
626
+ tabLine.append([pdfPage, selectedText, key])
627
 
628
  for r in tabLine:
629
  text_joined = ''.join(r[1])